xinjie.wang committed
Commit 8131b67 · 1 Parent(s): 33d9f9a
common.py CHANGED
@@ -55,9 +55,9 @@ from embodied_gen.utils.gpt_clients import GPT_CLIENT
 from embodied_gen.utils.process_media import (
     filter_image_small_connected_components,
     merge_images_video,
-    render_video,
 )
 from embodied_gen.utils.tags import VERSION
+from embodied_gen.utils.trender import render_video
 from embodied_gen.validators.quality_checkers import (
     BaseChecker,
     ImageAestheticChecker,
@@ -94,9 +94,6 @@ os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"
 os.environ["SPCONV_ALGO"] = "native"
 
 MAX_SEED = 100000
-DELIGHT = DelightingModel()
-IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
-# IMAGESR_MODEL = ImageStableSR()
 
 
 def patched_setup_functions(self):
@@ -136,6 +133,9 @@ def patched_setup_functions(self):
 Gaussian.setup_functions = patched_setup_functions
 
 
+DELIGHT = DelightingModel()
+IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
+# IMAGESR_MODEL = ImageStableSR()
 if os.getenv("GRADIO_APP") == "imageto3d":
     RBG_REMOVER = RembgRemover()
     RBG14_REMOVER = BMGG14Remover()
embodied_gen/data/backproject_v2.py CHANGED
@@ -251,6 +251,7 @@ class TextureBacker:
             during rendering. Defaults to 0.5.
         smooth_texture (bool, optional): If True, apply post-processing (e.g.,
             blurring) to the final texture. Defaults to True.
+        inpaint_smooth (bool, optional): If True, apply inpainting to smooth.
     """

     def __init__(
@@ -262,6 +263,7 @@ class TextureBacker:
         bake_angle_thresh: int = 75,
         mask_thresh: float = 0.5,
         smooth_texture: bool = True,
+        inpaint_smooth: bool = False,
     ) -> None:
         self.camera_params = camera_params
         self.renderer = None
@@ -271,6 +273,7 @@ class TextureBacker:
         self.texture_wh = texture_wh
         self.mask_thresh = mask_thresh
         self.smooth_texture = smooth_texture
+        self.inpaint_smooth = inpaint_smooth

         self.bake_angle_thresh = bake_angle_thresh
         self.bake_unreliable_kernel_size = int(
@@ -446,11 +449,12 @@ class TextureBacker:
     def uv_inpaint(
         self, mesh: trimesh.Trimesh, texture: np.ndarray, mask: np.ndarray
     ) -> np.ndarray:
-        vertices, faces, uv_map = self.get_mesh_np_attrs(mesh)
-        texture, mask = _texture_inpaint_smooth(
-            texture, mask, vertices, faces, uv_map
-        )
+        if self.inpaint_smooth:
+            vertices, faces, uv_map = self.get_mesh_np_attrs(mesh)
+            texture, mask = _texture_inpaint_smooth(
+                texture, mask, vertices, faces, uv_map
+            )

         texture = texture.clip(0, 1)
         texture = cv2.inpaint(
             (texture * 255).astype(np.uint8),
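Usage note (not part of the commit): with this change the vertex-based `_texture_inpaint_smooth` pass only runs when the new flag is set. A minimal sketch, assuming the remaining constructor inputs (`camera_params` and the mesh/texture/mask arrays) are prepared by the caller as elsewhere in this module; helper names below are illustrative.

# Hypothetical usage sketch of the new inpaint_smooth option.
from embodied_gen.data.backproject_v2 import TextureBacker

def bake_with_optional_smooth(camera_params, mesh, texture, mask, use_smooth: bool = False):
    backer = TextureBacker(
        camera_params=camera_params,  # assumed: a CameraSetting built by the caller
        smooth_texture=True,
        inpaint_smooth=use_smooth,  # opt back in to the vertex-based inpaint pass
    )
    # When use_smooth is False, uv_inpaint now skips _texture_inpaint_smooth
    # and relies on cv2.inpaint alone.
    return backer.uv_inpaint(mesh, texture, mask)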
embodied_gen/data/datasets.py CHANGED
@@ -19,8 +19,9 @@ import json
 import logging
 import os
 import random
-from typing import Any, Callable, Dict, List, Tuple
+from typing import Any, Callable, Dict, List, Literal, Tuple

+import numpy as np
 import torch
 import torch.utils.checkpoint
 from PIL import Image
@@ -36,6 +37,7 @@ logger = logging.getLogger(__name__)

 __all__ = [
     "Asset3dGenDataset",
+    "PanoGSplatDataset",
 ]


@@ -222,6 +224,68 @@ class Asset3dGenDataset(Dataset):
         return data


+class PanoGSplatDataset(Dataset):
+    """A PyTorch Dataset for loading panorama-based 3D Gaussian Splatting data.
+
+    This dataset is designed to be compatible with train and eval pipelines
+    that use COLMAP-style camera conventions.
+
+    Args:
+        data_dir (str): Root directory where the dataset file is located.
+        split (str): Dataset split to use, either "train" or "eval".
+        data_name (str, optional): Name of the dataset file (default: "gs_data.pt").
+        max_sample_num (int, optional): Maximum number of samples to load. If None,
+            all available samples in the split will be used.
+    """
+
+    def __init__(
+        self,
+        data_dir: str,
+        split: str = Literal["train", "eval"],
+        data_name: str = "gs_data.pt",
+        max_sample_num: int = None,
+    ) -> None:
+        self.data_path = os.path.join(data_dir, data_name)
+        self.split = split
+        self.max_sample_num = max_sample_num
+        if not os.path.exists(self.data_path):
+            raise FileNotFoundError(
+                f"Dataset file {self.data_path} not found. Please provide the correct path."
+            )
+        self.data = torch.load(self.data_path, weights_only=False)
+        self.frames = self.data[split]
+        if max_sample_num is not None:
+            self.frames = self.frames[:max_sample_num]
+        self.points = self.data.get("points", None)
+        self.points_rgb = self.data.get("points_rgb", None)
+
+    def __len__(self) -> int:
+        return len(self.frames)
+
+    def cvt_blender_to_colmap_coord(self, c2w: np.ndarray) -> np.ndarray:
+        # change from OpenGL/Blender camera axes (Y up, Z back) to COLMAP (Y down, Z forward)
+        tranformed_c2w = np.copy(c2w)
+        tranformed_c2w[:3, 1:3] *= -1
+
+        return tranformed_c2w
+
+    def __getitem__(self, index: int) -> dict[str, any]:
+        data = self.frames[index]
+        c2w = self.cvt_blender_to_colmap_coord(data["camtoworld"])
+        item = dict(
+            camtoworld=c2w,
+            K=data["K"],
+            image_h=data["image_h"],
+            image_w=data["image_w"],
+        )
+        if "image" in data:
+            item["image"] = data["image"]
+        if "image_id" in data:
+            item["image_id"] = data["image_id"]
+
+        return item
+
+
 if __name__ == "__main__":
     index_file = "datasets/objaverse/v1.0/statistics_1.0_gobjaverse_filter/view6s_v4/meta_ac2e0ddea8909db26d102c8465b5bcb2.json"  # noqa
     target_hw = (512, 512)
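For orientation, a minimal sketch of how the new `PanoGSplatDataset` might be consumed (the `gs_data.pt` location is illustrative; per the class docstring it is produced elsewhere in the pipeline):

# Hypothetical usage sketch of PanoGSplatDataset (path is illustrative).
from torch.utils.data import DataLoader
from embodied_gen.data.datasets import PanoGSplatDataset

dataset = PanoGSplatDataset(data_dir="outputs/scene_0000", split="train")
loader = DataLoader(dataset, batch_size=1, shuffle=True)
for batch in loader:
    # Poses are converted from Blender/OpenGL to COLMAP convention in __getitem__.
    c2w, K = batch["camtoworld"], batch["K"]
    break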
embodied_gen/data/differentiable_render.py CHANGED
@@ -33,7 +33,6 @@ from tqdm import tqdm
 from embodied_gen.data.utils import (
     CameraSetting,
     DiffrastRender,
-    RenderItems,
     as_list,
     calc_vertex_normals,
     import_kaolin_mesh,
@@ -42,6 +41,7 @@ from embodied_gen.data.utils import (
     render_pbr,
     save_images,
 )
+from embodied_gen.utils.enum import RenderItems

 os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
 os.environ["TORCH_EXTENSIONS_DIR"] = os.path.expanduser(
@@ -470,7 +470,7 @@ def parse_args():
         "--pbr_light_factor",
         type=float,
         default=1.0,
-        help="Light factor for mesh PBR rendering (default: 2.)",
+        help="Light factor for mesh PBR rendering (default: 1.)",
     )
     parser.add_argument(
         "--with_mtl",
@@ -482,6 +482,11 @@
         action="store_true",
         help="Whether to generate color .gif rendering file.",
     )
+    parser.add_argument(
+        "--no_index_file",
+        action="store_true",
+        help="Whether skip the index file saving.",
+    )
     parser.add_argument(
         "--gen_color_mp4",
         action="store_true",
@@ -568,7 +573,7 @@ def entrypoint(**kwargs) -> None:
         gen_viewnormal_mp4=args.gen_viewnormal_mp4,
         gen_glonormal_mp4=args.gen_glonormal_mp4,
         light_factor=args.pbr_light_factor,
-        no_index_file=gen_video,
+        no_index_file=gen_video or args.no_index_file,
     )
     image_render.render_mesh(
         mesh_path=args.mesh_path,
embodied_gen/data/mesh_operator.py CHANGED
@@ -395,6 +395,8 @@ class MeshFixer(object):
             self.vertices_np,
             np.hstack([np.full((self.faces.shape[0], 1), 3), self.faces_np]),
         )
+        mesh.clean(inplace=True)
+        mesh.clear_data()
        mesh = mesh.decimate(ratio, progress_bar=True)

         # Update vertices and faces
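Side note on the two new calls: cleaning the PyVista mesh and dropping its point/cell data arrays keeps the decimation input minimal. A standalone sketch on toy geometry (not the MeshFixer internals):

# Toy illustration of clean() + clear_data() before decimate() with PyVista.
import pyvista as pv

mesh = pv.Sphere(theta_resolution=64, phi_resolution=64)
mesh.clean(inplace=True)   # merge duplicate points, drop unused ones
mesh.clear_data()          # remove point/cell data arrays before decimation
decimated = mesh.decimate(0.5, progress_bar=True)  # target ~50% face reduction
print(mesh.n_cells, "->", decimated.n_cells)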
embodied_gen/data/utils.py CHANGED
@@ -38,7 +38,6 @@ except ImportError:
     ChatGLMModel = None
 import logging
 from dataclasses import dataclass, field
-from enum import Enum

 import trimesh
 from kaolin.render.camera import Camera
@@ -57,7 +56,6 @@ __all__ = [
     "load_mesh_to_unit_cube",
     "as_list",
     "CameraSetting",
-    "RenderItems",
     "import_kaolin_mesh",
     "save_mesh_with_mtl",
     "get_images_from_grid",
@@ -160,8 +158,9 @@ class DiffrastRender(object):

         return normalized_maps

+    @staticmethod
     def normalize_map_by_mask(
-        self, map: torch.Tensor, mask: torch.Tensor
+        map: torch.Tensor, mask: torch.Tensor
     ) -> torch.Tensor:
         # Normalize all maps in total by mask, normalized map in [0, 1].
         foreground = (mask == 1).squeeze(dim=-1)
@@ -738,18 +737,6 @@ class CameraSetting:
         self.Ks = Ks


-@dataclass
-class RenderItems(str, Enum):
-    IMAGE = "image_color"
-    ALPHA = "image_mask"
-    VIEW_NORMAL = "image_view_normal"
-    GLOBAL_NORMAL = "image_global_normal"
-    POSITION_MAP = "image_position"
-    DEPTH = "image_depth"
-    ALBEDO = "image_albedo"
-    DIFFUSE = "image_diffuse"
-
-
 def _compute_az_el_by_camera_params(
     camera_params: CameraSetting, flip_az: bool = False
 ):
embodied_gen/models/image_comm_model.py ADDED
@@ -0,0 +1,236 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+ # Text-to-Image generation models from Hugging Face community.
17
+
18
+ import os
19
+ from abc import ABC, abstractmethod
20
+
21
+ import torch
22
+ from diffusers import (
23
+ ChromaPipeline,
24
+ Cosmos2TextToImagePipeline,
25
+ DPMSolverMultistepScheduler,
26
+ FluxPipeline,
27
+ KolorsPipeline,
28
+ StableDiffusion3Pipeline,
29
+ )
30
+ from diffusers.quantizers import PipelineQuantizationConfig
31
+ from huggingface_hub import snapshot_download
32
+ from PIL import Image
33
+ from transformers import AutoModelForCausalLM, SiglipProcessor
34
+
35
+ __all__ = [
36
+ "build_hf_image_pipeline",
37
+ ]
38
+
39
+
40
+ class BasePipelineLoader(ABC):
41
+ def __init__(self, device="cuda"):
42
+ self.device = device
43
+
44
+ @abstractmethod
45
+ def load(self):
46
+ pass
47
+
48
+
49
+ class BasePipelineRunner(ABC):
50
+ def __init__(self, pipe):
51
+ self.pipe = pipe
52
+
53
+ @abstractmethod
54
+ def run(self, prompt: str, **kwargs) -> Image.Image:
55
+ pass
56
+
57
+
58
+ # ===== SD3.5-medium =====
59
+ class SD35Loader(BasePipelineLoader):
60
+ def load(self):
61
+ pipe = StableDiffusion3Pipeline.from_pretrained(
62
+ "stabilityai/stable-diffusion-3.5-medium",
63
+ torch_dtype=torch.float16,
64
+ )
65
+ pipe = pipe.to(self.device)
66
+ pipe.enable_model_cpu_offload()
67
+ pipe.enable_xformers_memory_efficient_attention()
68
+ pipe.enable_attention_slicing()
69
+ return pipe
70
+
71
+
72
+ class SD35Runner(BasePipelineRunner):
73
+ def run(self, prompt: str, **kwargs) -> Image.Image:
74
+ return self.pipe(prompt=prompt, **kwargs).images
75
+
76
+
77
+ # ===== Cosmos2 =====
78
+ class CosmosLoader(BasePipelineLoader):
79
+ def __init__(
80
+ self,
81
+ model_id="nvidia/Cosmos-Predict2-2B-Text2Image",
82
+ local_dir="weights/cosmos2",
83
+ device="cuda",
84
+ ):
85
+ super().__init__(device)
86
+ self.model_id = model_id
87
+ self.local_dir = local_dir
88
+
89
+ def _patch(self):
90
+ def patch_model(cls):
91
+ orig = cls.from_pretrained
92
+
93
+ def new(*args, **kwargs):
94
+ kwargs.setdefault("attn_implementation", "flash_attention_2")
95
+ kwargs.setdefault("torch_dtype", torch.bfloat16)
96
+ return orig(*args, **kwargs)
97
+
98
+ cls.from_pretrained = new
99
+
100
+ def patch_processor(cls):
101
+ orig = cls.from_pretrained
102
+
103
+ def new(*args, **kwargs):
104
+ kwargs.setdefault("use_fast", True)
105
+ return orig(*args, **kwargs)
106
+
107
+ cls.from_pretrained = new
108
+
109
+ patch_model(AutoModelForCausalLM)
110
+ patch_processor(SiglipProcessor)
111
+
112
+ def load(self):
113
+ self._patch()
114
+ snapshot_download(
115
+ repo_id=self.model_id,
116
+ local_dir=self.local_dir,
117
+ local_dir_use_symlinks=False,
118
+ resume_download=True,
119
+ )
120
+
121
+ config = PipelineQuantizationConfig(
122
+ quant_backend="bitsandbytes_4bit",
123
+ quant_kwargs={
124
+ "load_in_4bit": True,
125
+ "bnb_4bit_quant_type": "nf4",
126
+ "bnb_4bit_compute_dtype": torch.bfloat16,
127
+ "bnb_4bit_use_double_quant": True,
128
+ },
129
+ components_to_quantize=["text_encoder", "transformer", "unet"],
130
+ )
131
+
132
+ pipe = Cosmos2TextToImagePipeline.from_pretrained(
133
+ self.model_id,
134
+ torch_dtype=torch.bfloat16,
135
+ quantization_config=config,
136
+ use_safetensors=True,
137
+ safety_checker=None,
138
+ requires_safety_checker=False,
139
+ ).to(self.device)
140
+ return pipe
141
+
142
+
143
+ class CosmosRunner(BasePipelineRunner):
144
+ def run(self, prompt: str, negative_prompt=None, **kwargs) -> Image.Image:
145
+ return self.pipe(
146
+ prompt=prompt, negative_prompt=negative_prompt, **kwargs
147
+ ).images
148
+
149
+
150
+ # ===== Kolors =====
151
+ class KolorsLoader(BasePipelineLoader):
152
+ def load(self):
153
+ pipe = KolorsPipeline.from_pretrained(
154
+ "Kwai-Kolors/Kolors-diffusers",
155
+ torch_dtype=torch.float16,
156
+ variant="fp16",
157
+ ).to(self.device)
158
+ pipe.enable_model_cpu_offload()
159
+ pipe.enable_xformers_memory_efficient_attention()
160
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(
161
+ pipe.scheduler.config, use_karras_sigmas=True
162
+ )
163
+ return pipe
164
+
165
+
166
+ class KolorsRunner(BasePipelineRunner):
167
+ def run(self, prompt: str, **kwargs) -> Image.Image:
168
+ return self.pipe(prompt=prompt, **kwargs).images
169
+
170
+
171
+ # ===== Flux =====
172
+ class FluxLoader(BasePipelineLoader):
173
+ def load(self):
174
+ os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
175
+ pipe = FluxPipeline.from_pretrained(
176
+ "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
177
+ )
178
+ pipe.enable_model_cpu_offload()
179
+ pipe.enable_xformers_memory_efficient_attention()
180
+ pipe.enable_attention_slicing()
181
+ return pipe.to(self.device)
182
+
183
+
184
+ class FluxRunner(BasePipelineRunner):
185
+ def run(self, prompt: str, **kwargs) -> Image.Image:
186
+ return self.pipe(prompt=prompt, **kwargs).images
187
+
188
+
189
+ # ===== Chroma =====
190
+ class ChromaLoader(BasePipelineLoader):
191
+ def load(self):
192
+ return ChromaPipeline.from_pretrained(
193
+ "lodestones/Chroma", torch_dtype=torch.bfloat16
194
+ ).to(self.device)
195
+
196
+
197
+ class ChromaRunner(BasePipelineRunner):
198
+ def run(self, prompt: str, negative_prompt=None, **kwargs) -> Image.Image:
199
+ return self.pipe(
200
+ prompt=prompt, negative_prompt=negative_prompt, **kwargs
201
+ ).images
202
+
203
+
204
+ PIPELINE_REGISTRY = {
205
+ "sd35": (SD35Loader, SD35Runner),
206
+ "cosmos": (CosmosLoader, CosmosRunner),
207
+ "kolors": (KolorsLoader, KolorsRunner),
208
+ "flux": (FluxLoader, FluxRunner),
209
+ "chroma": (ChromaLoader, ChromaRunner),
210
+ }
211
+
212
+
213
+ def build_hf_image_pipeline(name: str, device="cuda") -> BasePipelineRunner:
214
+ if name not in PIPELINE_REGISTRY:
215
+ raise ValueError(f"Unsupported model: {name}")
216
+ loader_cls, runner_cls = PIPELINE_REGISTRY[name]
217
+ pipe = loader_cls(device=device).load()
218
+
219
+ return runner_cls(pipe)
220
+
221
+
222
+ if __name__ == "__main__":
223
+ model_name = "sd35"
224
+ runner = build_hf_image_pipeline(model_name)
225
+ # NOTE: Just for pipeline testing, generation quality at low resolution is poor.
226
+ images = runner.run(
227
+ prompt="A robot holding a sign that says 'Hello'",
228
+ height=512,
229
+ width=512,
230
+ num_inference_steps=10,
231
+ guidance_scale=6,
232
+ num_images_per_prompt=1,
233
+ )
234
+
235
+ for i, img in enumerate(images):
236
+ img.save(f"image_{model_name}_{i}.jpg")
embodied_gen/models/text_model.py CHANGED
@@ -52,6 +52,12 @@ __all__ = [
     "download_kolors_weights",
 ]

+PROMPT_APPEND = (
+    "Angled 3D view of one {object}, centered, no cropping, no occlusion, isolated product photo, "
+    "no surroundings, high-quality appearance, vivid colors, on a plain clean surface, 3D style revealing multiple surfaces"
+)
+PROMPT_KAPPEND = "Single {object}, in the center of the image, white background, 3D style, best quality"
+

 def download_kolors_weights(local_dir: str = "weights/Kolors") -> None:
     logger.info(f"Download kolors weights from huggingface...")
@@ -179,8 +185,7 @@
     ip_image_size: int = 512,
     seed: int = None,
 ) -> list[Image.Image]:
-    prompt = "Single " + prompt + ", in the center of the image"
-    prompt += ", high quality, high resolution, best quality, white background, 3D style"  # noqa
+    prompt = PROMPT_KAPPEND.format(object=prompt.strip())
     logger.info(f"Processing prompt: {prompt}")

     generator = None
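To make the two new templates concrete, a small illustration of how they expand (the object string is an arbitrary example; per this commit, PROMPT_KAPPEND is used inside text2img_gen and PROMPT_APPEND by embodied_gen/scripts/textto3d.py):

# Illustration only: expanding the new prompt templates with an example object.
from embodied_gen.models.text_model import PROMPT_APPEND, PROMPT_KAPPEND

obj = "wooden coffee mug"
print(PROMPT_KAPPEND.format(object=obj))
# -> "Single wooden coffee mug, in the center of the image, white background, 3D style, best quality"
print(PROMPT_APPEND.format(object=obj))  # the longer, multi-surface variant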
embodied_gen/scripts/gen_scene3d.py ADDED
@@ -0,0 +1,191 @@
1
+ import logging
2
+ import os
3
+ import random
4
+ import time
5
+ import warnings
6
+ from dataclasses import dataclass, field
7
+ from shutil import copy, rmtree
8
+
9
+ import torch
10
+ import tyro
11
+ from huggingface_hub import snapshot_download
12
+ from packaging import version
13
+
14
+ # Suppress warnings
15
+ warnings.filterwarnings("ignore", category=FutureWarning)
16
+ logging.getLogger("transformers").setLevel(logging.ERROR)
17
+ logging.getLogger("diffusers").setLevel(logging.ERROR)
18
+
19
+ # TorchVision monkey patch for >0.16
20
+ if version.parse(torch.__version__) >= version.parse("0.16"):
21
+ import sys
22
+ import types
23
+
24
+ import torchvision.transforms.functional as TF
25
+
26
+ functional_tensor = types.ModuleType(
27
+ "torchvision.transforms.functional_tensor"
28
+ )
29
+ functional_tensor.rgb_to_grayscale = TF.rgb_to_grayscale
30
+ sys.modules["torchvision.transforms.functional_tensor"] = functional_tensor
31
+
32
+ from gsplat.distributed import cli
33
+ from txt2panoimg import Text2360PanoramaImagePipeline
34
+ from embodied_gen.trainer.gsplat_trainer import (
35
+ DefaultStrategy,
36
+ GsplatTrainConfig,
37
+ )
38
+ from embodied_gen.trainer.gsplat_trainer import entrypoint as gsplat_entrypoint
39
+ from embodied_gen.trainer.pono2mesh_trainer import Pano2MeshSRPipeline
40
+ from embodied_gen.utils.config import Pano2MeshSRConfig
41
+ from embodied_gen.utils.gaussian import restore_scene_scale_and_position
42
+ from embodied_gen.utils.gpt_clients import GPT_CLIENT
43
+ from embodied_gen.utils.log import logger
44
+ from embodied_gen.utils.process_media import is_image_file, parse_text_prompts
45
+ from embodied_gen.validators.quality_checkers import (
46
+ PanoHeightEstimator,
47
+ PanoImageOccChecker,
48
+ )
49
+
50
+ __all__ = [
51
+ "generate_pano_image",
52
+ "entrypoint",
53
+ ]
54
+
55
+
56
+ @dataclass
57
+ class Scene3DGenConfig:
58
+ prompts: list[str] # Text desc of indoor room or style reference image.
59
+ output_dir: str
60
+ seed: int | None = None
61
+ real_height: float | None = None # The real height of the room in meters.
62
+ pano_image_only: bool = False
63
+ disable_pano_check: bool = False
64
+ keep_middle_result: bool = False
65
+ n_retry: int = 7
66
+ gs3d: GsplatTrainConfig = field(
67
+ default_factory=lambda: GsplatTrainConfig(
68
+ strategy=DefaultStrategy(verbose=True),
69
+ max_steps=4000,
70
+ init_opa=0.9,
71
+ opacity_reg=2e-3,
72
+ sh_degree=0,
73
+ means_lr=1e-4,
74
+ scales_lr=1e-3,
75
+ )
76
+ )
77
+
78
+
79
+ def generate_pano_image(
80
+ prompt: str,
81
+ output_path: str,
82
+ pipeline,
83
+ seed: int,
84
+ n_retry: int,
85
+ checker=None,
86
+ num_inference_steps: int = 40,
87
+ ) -> None:
88
+ for i in range(n_retry):
89
+ logger.info(
90
+ f"GEN Panorama: Retry {i+1}/{n_retry} for prompt: {prompt}, seed: {seed}"
91
+ )
92
+ if is_image_file(prompt):
93
+ raise NotImplementedError("Image mode not implemented yet.")
94
+ else:
95
+ txt_prompt = f"{prompt}, spacious, empty, wide open, open floor, minimal furniture"
96
+ inputs = {
97
+ "prompt": txt_prompt,
98
+ "num_inference_steps": num_inference_steps,
99
+ "upscale": False,
100
+ "seed": seed,
101
+ }
102
+ pano_image = pipeline(inputs)
103
+
104
+ pano_image.save(output_path)
105
+ if checker is None:
106
+ break
107
+
108
+ flag, response = checker(pano_image)
109
+ logger.warning(f"{response}, image saved in {output_path}")
110
+ if flag is True or flag is None:
111
+ break
112
+
113
+ seed = random.randint(0, 100000)
114
+
115
+ return
116
+
117
+
118
+ def entrypoint(*args, **kwargs):
119
+ cfg = tyro.cli(Scene3DGenConfig)
120
+
121
+ # Init global models.
122
+ model_path = snapshot_download("archerfmy0831/sd-t2i-360panoimage")
123
+ IMG2PANO_PIPE = Text2360PanoramaImagePipeline(
124
+ model_path, torch_dtype=torch.float16, device="cuda"
125
+ )
126
+ PANOMESH_CFG = Pano2MeshSRConfig()
127
+ PANO2MESH_PIPE = Pano2MeshSRPipeline(PANOMESH_CFG)
128
+ PANO_CHECKER = PanoImageOccChecker(GPT_CLIENT, box_hw=[95, 1000])
129
+ PANOHEIGHT_ESTOR = PanoHeightEstimator(GPT_CLIENT)
130
+
131
+ prompts = parse_text_prompts(cfg.prompts)
132
+ for idx, prompt in enumerate(prompts):
133
+ start_time = time.time()
134
+ output_dir = os.path.join(cfg.output_dir, f"scene_{idx:04d}")
135
+ os.makedirs(output_dir, exist_ok=True)
136
+ pano_path = os.path.join(output_dir, "pano_image.png")
137
+ with open(f"{output_dir}/prompt.txt", "w") as f:
138
+ f.write(prompt)
139
+
140
+ generate_pano_image(
141
+ prompt,
142
+ pano_path,
143
+ IMG2PANO_PIPE,
144
+ cfg.seed if cfg.seed is not None else random.randint(0, 100000),
145
+ cfg.n_retry,
146
+ checker=None if cfg.disable_pano_check else PANO_CHECKER,
147
+ )
148
+
149
+ if cfg.pano_image_only:
150
+ continue
151
+
152
+ logger.info("GEN and REPAIR Mesh from Panorama...")
153
+ PANO2MESH_PIPE(pano_path, output_dir)
154
+
155
+ logger.info("TRAIN 3DGS from Mesh Init and Cube Image...")
156
+ cfg.gs3d.data_dir = output_dir
157
+ cfg.gs3d.result_dir = f"{output_dir}/gaussian"
158
+ cfg.gs3d.adjust_steps(cfg.gs3d.steps_scaler)
159
+ torch.set_default_device("cpu") # recover default setting.
160
+ cli(gsplat_entrypoint, cfg.gs3d, verbose=True)
161
+
162
+ # Clean up the middle results.
163
+ gs_path = (
164
+ f"{cfg.gs3d.result_dir}/ply/point_cloud_{cfg.gs3d.max_steps-1}.ply"
165
+ )
166
+ copy(gs_path, f"{output_dir}/gs_model.ply")
167
+ video_path = f"{cfg.gs3d.result_dir}/renders/video_step{cfg.gs3d.max_steps-1}.mp4"
168
+ copy(video_path, f"{output_dir}/video.mp4")
169
+ gs_cfg_path = f"{cfg.gs3d.result_dir}/cfg.yml"
170
+ copy(gs_cfg_path, f"{output_dir}/gsplat_cfg.yml")
171
+ if not cfg.keep_middle_result:
172
+ rmtree(cfg.gs3d.result_dir, ignore_errors=True)
173
+ os.remove(f"{output_dir}/{PANOMESH_CFG.gs_data_file}")
174
+
175
+ real_height = (
176
+ PANOHEIGHT_ESTOR(pano_path)
177
+ if cfg.real_height is None
178
+ else cfg.real_height
179
+ )
180
+ gs_path = os.path.join(output_dir, "gs_model.ply")
181
+ mesh_path = os.path.join(output_dir, "mesh_model.ply")
182
+ restore_scene_scale_and_position(real_height, mesh_path, gs_path)
183
+
184
+ elapsed_time = (time.time() - start_time) / 60
185
+ logger.info(
186
+ f"FINISHED 3D scene generation in {output_dir} in {elapsed_time:.2f} mins."
187
+ )
188
+
189
+
190
+ if __name__ == "__main__":
191
+ entrypoint()
embodied_gen/scripts/imageto3d.py CHANGED
@@ -16,29 +16,28 @@
16
 
17
 
18
  import argparse
19
- import logging
20
  import os
 
21
  import sys
22
  from glob import glob
23
  from shutil import copy, copytree, rmtree
24
 
25
  import numpy as np
 
26
  import trimesh
27
  from PIL import Image
28
  from embodied_gen.data.backproject_v2 import entrypoint as backproject_api
29
  from embodied_gen.data.utils import delete_dir, trellis_preprocess
30
  from embodied_gen.models.delight_model import DelightingModel
31
  from embodied_gen.models.gs_model import GaussianOperator
32
- from embodied_gen.models.segment_model import (
33
- BMGG14Remover,
34
- RembgRemover,
35
- SAMPredictor,
36
- )
37
  from embodied_gen.models.sr_model import ImageRealESRGAN
38
  from embodied_gen.scripts.render_gs import entrypoint as render_gs_api
39
  from embodied_gen.utils.gpt_clients import GPT_CLIENT
40
- from embodied_gen.utils.process_media import merge_images_video, render_video
 
41
  from embodied_gen.utils.tags import VERSION
 
42
  from embodied_gen.validators.quality_checkers import (
43
  BaseChecker,
44
  ImageAestheticChecker,
@@ -52,36 +51,25 @@ current_dir = os.path.dirname(current_file_path)
52
  sys.path.append(os.path.join(current_dir, "../.."))
53
  from thirdparty.TRELLIS.trellis.pipelines import TrellisImageTo3DPipeline
54
 
55
- logging.basicConfig(
56
- format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
57
- )
58
- logger = logging.getLogger(__name__)
59
-
60
-
61
  os.environ["TORCH_EXTENSIONS_DIR"] = os.path.expanduser(
62
  "~/.cache/torch_extensions"
63
  )
64
  os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"
65
  os.environ["SPCONV_ALGO"] = "native"
 
66
 
67
-
68
  DELIGHT = DelightingModel()
69
  IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
70
-
71
  RBG_REMOVER = RembgRemover()
72
- RBG14_REMOVER = BMGG14Remover()
73
- SAM_PREDICTOR = SAMPredictor(model_type="vit_h", device="cpu")
74
  PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
75
  "microsoft/TRELLIS-image-large"
76
  )
77
- PIPELINE.cuda()
78
  SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
79
  GEO_CHECKER = MeshGeoChecker(GPT_CLIENT)
80
  AESTHETIC_CHECKER = ImageAestheticChecker()
81
  CHECKERS = [GEO_CHECKER, SEG_CHECKER, AESTHETIC_CHECKER]
82
- TMP_DIR = os.path.join(
83
- os.path.dirname(os.path.abspath(__file__)), "sessions/imageto3d"
84
- )
85
 
86
 
87
  def parse_args():
@@ -95,7 +83,6 @@ def parse_args():
95
  parser.add_argument(
96
  "--output_root",
97
  type=str,
98
- required=True,
99
  help="Root directory for saving outputs.",
100
  )
101
  parser.add_argument(
@@ -110,12 +97,26 @@ def parse_args():
110
  default=None,
111
  help="The mass in kg to restore the mesh real weight.",
112
  )
113
- parser.add_argument("--asset_type", type=str, default=None)
114
  parser.add_argument("--skip_exists", action="store_true")
115
- parser.add_argument("--strict_seg", action="store_true")
116
  parser.add_argument("--version", type=str, default=VERSION)
117
- parser.add_argument("--remove_intermediate", type=bool, default=True)
118
- args = parser.parse_args()
119
 
120
  assert (
121
  args.image_path or args.image_root
@@ -125,13 +126,7 @@ def parse_args():
125
  args.image_path += glob(os.path.join(args.image_root, "*.jpg"))
126
  args.image_path += glob(os.path.join(args.image_root, "*.jpeg"))
127
 
128
- return args
129
-
130
-
131
- if __name__ == "__main__":
132
- args = parse_args()
133
-
134
- for image_path in args.image_path:
135
  try:
136
  filename = os.path.basename(image_path).split(".")[0]
137
  output_root = args.output_root
@@ -141,7 +136,7 @@ if __name__ == "__main__":
141
 
142
  mesh_out = f"{output_root}/{filename}.obj"
143
  if args.skip_exists and os.path.exists(mesh_out):
144
- logger.info(
145
  f"Skip {image_path}, already processed in {mesh_out}"
146
  )
147
  continue
@@ -149,67 +144,84 @@ if __name__ == "__main__":
149
  image = Image.open(image_path)
150
  image.save(f"{output_root}/{filename}_raw.png")
151
 
152
- # Segmentation: Get segmented image using SAM or Rembg.
153
  seg_path = f"{output_root}/{filename}_cond.png"
154
- if image.mode != "RGBA":
155
- seg_image = RBG_REMOVER(image, save_path=seg_path)
156
- seg_image = trellis_preprocess(seg_image)
157
- else:
158
- seg_image = image
159
- seg_image.save(seg_path)
160
-
161
- # Run the pipeline
162
- try:
163
- outputs = PIPELINE.run(
164
- seg_image,
165
- preprocess_image=False,
166
- # Optional parameters
167
- # seed=1,
168
- # sparse_structure_sampler_params={
169
- # "steps": 12,
170
- # "cfg_strength": 7.5,
171
- # },
172
- # slat_sampler_params={
173
- # "steps": 12,
174
- # "cfg_strength": 3,
175
- # },
176
  )
177
- except Exception as e:
178
- logger.error(
179
- f"[Pipeline Failed] process {image_path}: {e}, skip."
180
  )
181
- continue
182
 
183
- # Render and save color and mesh videos
184
- gs_model = outputs["gaussian"][0]
185
- mesh_model = outputs["mesh"][0]
186
  color_images = render_video(gs_model)["color"]
187
  normal_images = render_video(mesh_model)["normal"]
188
  video_path = os.path.join(output_root, "gs_mesh.mp4")
189
  merge_images_video(color_images, normal_images, video_path)
190
 
191
- # Save the raw Gaussian model
192
- gs_path = mesh_out.replace(".obj", "_gs.ply")
193
- gs_model.save_ply(gs_path)
194
-
195
- # Rotate mesh and GS by 90 degrees around Z-axis.
196
- rot_matrix = [[0, 0, -1], [0, 1, 0], [1, 0, 0]]
197
- gs_add_rot = [[1, 0, 0], [0, -1, 0], [0, 0, -1]]
198
- mesh_add_rot = [[1, 0, 0], [0, 0, -1], [0, 1, 0]]
199
-
200
- # Addtional rotation for GS to align mesh.
201
- gs_rot = np.array(gs_add_rot) @ np.array(rot_matrix)
202
- pose = GaussianOperator.trans_to_quatpose(gs_rot)
203
- aligned_gs_path = gs_path.replace(".ply", "_aligned.ply")
204
- GaussianOperator.resave_ply(
205
- in_ply=gs_path,
206
- out_ply=aligned_gs_path,
207
- instance_pose=pose,
208
- device="cpu",
209
- )
210
- color_path = os.path.join(output_root, "color.png")
211
- render_gs_api(aligned_gs_path, color_path)
212
-
213
  mesh = trimesh.Trimesh(
214
  vertices=mesh_model.vertices.cpu().numpy(),
215
  faces=mesh_model.faces.cpu().numpy(),
@@ -249,8 +261,8 @@ if __name__ == "__main__":
249
  min_mass, max_mass = map(float, args.mass_range.split("-"))
250
  asset_attrs["min_mass"] = min_mass
251
  asset_attrs["max_mass"] = max_mass
252
- if args.asset_type:
253
- asset_attrs["category"] = args.asset_type
254
  if args.version:
255
  asset_attrs["version"] = args.version
256
 
@@ -289,8 +301,8 @@ if __name__ == "__main__":
289
  ]
290
  images_list.append(images)
291
 
292
- results = BaseChecker.validate(CHECKERS, images_list)
293
- urdf_convertor.add_quality_tag(urdf_path, results)
294
 
295
  # Organize the final result files
296
  result_dir = f"{output_root}/result"
@@ -303,7 +315,7 @@ if __name__ == "__main__":
303
  f"{result_dir}/{urdf_convertor.output_mesh_dir}",
304
  )
305
  copy(video_path, f"{result_dir}/video.mp4")
306
- if args.remove_intermediate:
307
  delete_dir(output_root, keep_subs=["result"])
308
 
309
  except Exception as e:
@@ -311,3 +323,7 @@ if __name__ == "__main__":
311
  continue
312
 
313
  logger.info(f"Processing complete. Outputs saved to {args.output_root}")
 
 
 
 
 
16
 
17
 
18
  import argparse
 
19
  import os
20
+ import random
21
  import sys
22
  from glob import glob
23
  from shutil import copy, copytree, rmtree
24
 
25
  import numpy as np
26
+ import torch
27
  import trimesh
28
  from PIL import Image
29
  from embodied_gen.data.backproject_v2 import entrypoint as backproject_api
30
  from embodied_gen.data.utils import delete_dir, trellis_preprocess
31
  from embodied_gen.models.delight_model import DelightingModel
32
  from embodied_gen.models.gs_model import GaussianOperator
33
+ from embodied_gen.models.segment_model import RembgRemover
 
 
 
 
34
  from embodied_gen.models.sr_model import ImageRealESRGAN
35
  from embodied_gen.scripts.render_gs import entrypoint as render_gs_api
36
  from embodied_gen.utils.gpt_clients import GPT_CLIENT
37
+ from embodied_gen.utils.log import logger
38
+ from embodied_gen.utils.process_media import merge_images_video
39
  from embodied_gen.utils.tags import VERSION
40
+ from embodied_gen.utils.trender import render_video
41
  from embodied_gen.validators.quality_checkers import (
42
  BaseChecker,
43
  ImageAestheticChecker,
 
51
  sys.path.append(os.path.join(current_dir, "../.."))
52
  from thirdparty.TRELLIS.trellis.pipelines import TrellisImageTo3DPipeline
53
 
 
 
 
 
 
 
54
  os.environ["TORCH_EXTENSIONS_DIR"] = os.path.expanduser(
55
  "~/.cache/torch_extensions"
56
  )
57
  os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"
58
  os.environ["SPCONV_ALGO"] = "native"
59
+ random.seed(0)
60
 
61
+ logger.info("Loading Models...")
62
  DELIGHT = DelightingModel()
63
  IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
 
64
  RBG_REMOVER = RembgRemover()
 
 
65
  PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
66
  "microsoft/TRELLIS-image-large"
67
  )
68
+ # PIPELINE.cuda()
69
  SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
70
  GEO_CHECKER = MeshGeoChecker(GPT_CLIENT)
71
  AESTHETIC_CHECKER = ImageAestheticChecker()
72
  CHECKERS = [GEO_CHECKER, SEG_CHECKER, AESTHETIC_CHECKER]
 
 
 
73
 
74
 
75
  def parse_args():
 
83
  parser.add_argument(
84
  "--output_root",
85
  type=str,
 
86
  help="Root directory for saving outputs.",
87
  )
88
  parser.add_argument(
 
97
  default=None,
98
  help="The mass in kg to restore the mesh real weight.",
99
  )
100
+ parser.add_argument("--asset_type", type=str, nargs="+", default=None)
101
  parser.add_argument("--skip_exists", action="store_true")
 
102
  parser.add_argument("--version", type=str, default=VERSION)
103
+ parser.add_argument("--keep_intermediate", action="store_true")
104
+ parser.add_argument("--seed", type=int, default=0)
105
+ parser.add_argument(
106
+ "--n_retry",
107
+ type=int,
108
+ default=2,
109
+ )
110
+ args, unknown = parser.parse_known_args()
111
+
112
+ return args
113
+
114
+
115
+ def entrypoint(**kwargs):
116
+ args = parse_args()
117
+ for k, v in kwargs.items():
118
+ if hasattr(args, k) and v is not None:
119
+ setattr(args, k, v)
120
 
121
  assert (
122
  args.image_path or args.image_root
 
126
  args.image_path += glob(os.path.join(args.image_root, "*.jpg"))
127
  args.image_path += glob(os.path.join(args.image_root, "*.jpeg"))
128
 
129
+ for idx, image_path in enumerate(args.image_path):
 
 
 
 
 
 
130
  try:
131
  filename = os.path.basename(image_path).split(".")[0]
132
  output_root = args.output_root
 
136
 
137
  mesh_out = f"{output_root}/{filename}.obj"
138
  if args.skip_exists and os.path.exists(mesh_out):
139
+ logger.warning(
140
  f"Skip {image_path}, already processed in {mesh_out}"
141
  )
142
  continue
 
144
  image = Image.open(image_path)
145
  image.save(f"{output_root}/{filename}_raw.png")
146
 
147
+ # Segmentation: Get segmented image using Rembg.
148
  seg_path = f"{output_root}/{filename}_cond.png"
149
+ seg_image = RBG_REMOVER(image) if image.mode != "RGBA" else image
150
+ seg_image = trellis_preprocess(seg_image)
151
+ seg_image.save(seg_path)
152
+
153
+ seed = args.seed
154
+ for try_idx in range(args.n_retry):
155
+ logger.info(
156
+ f"Try: {try_idx + 1}/{args.n_retry}, Seed: {seed}, Prompt: {seg_path}"
157
  )
158
+ # Run the pipeline
159
+ try:
160
+ PIPELINE.cuda()
161
+ outputs = PIPELINE.run(
162
+ seg_image,
163
+ preprocess_image=False,
164
+ seed=(
165
+ random.randint(0, 100000) if seed is None else seed
166
+ ),
167
+ # Optional parameters
168
+ # sparse_structure_sampler_params={
169
+ # "steps": 12,
170
+ # "cfg_strength": 7.5,
171
+ # },
172
+ # slat_sampler_params={
173
+ # "steps": 12,
174
+ # "cfg_strength": 3,
175
+ # },
176
+ )
177
+ PIPELINE.cpu()
178
+ torch.cuda.empty_cache()
179
+ except Exception as e:
180
+ logger.error(
181
+ f"[Pipeline Failed] process {image_path}: {e}, skip."
182
+ )
183
+ continue
184
+
185
+ gs_model = outputs["gaussian"][0]
186
+ mesh_model = outputs["mesh"][0]
187
+
188
+ # Save the raw Gaussian model
189
+ gs_path = mesh_out.replace(".obj", "_gs.ply")
190
+ gs_model.save_ply(gs_path)
191
+
192
+ # Rotate mesh and GS by 90 degrees around Z-axis.
193
+ rot_matrix = [[0, 0, -1], [0, 1, 0], [1, 0, 0]]
194
+ gs_add_rot = [[1, 0, 0], [0, -1, 0], [0, 0, -1]]
195
+ mesh_add_rot = [[1, 0, 0], [0, 0, -1], [0, 1, 0]]
196
+
197
+ # Addtional rotation for GS to align mesh.
198
+ gs_rot = np.array(gs_add_rot) @ np.array(rot_matrix)
199
+ pose = GaussianOperator.trans_to_quatpose(gs_rot)
200
+ aligned_gs_path = gs_path.replace(".ply", "_aligned.ply")
201
+ GaussianOperator.resave_ply(
202
+ in_ply=gs_path,
203
+ out_ply=aligned_gs_path,
204
+ instance_pose=pose,
205
+ device="cpu",
206
  )
207
+ color_path = os.path.join(output_root, "color.png")
208
+ render_gs_api(aligned_gs_path, color_path)
209
+
210
+ geo_flag, geo_result = GEO_CHECKER([color_path])
211
+ logger.warning(
212
+ f"{GEO_CHECKER.__class__.__name__}: {geo_result} for {seg_path}"
213
+ )
214
+ if geo_flag is True or geo_flag is None:
215
+ break
216
+
217
+ seed = random.randint(0, 100000) if seed is not None else None
218
 
219
+ # Render the video for generated 3D asset.
 
 
220
  color_images = render_video(gs_model)["color"]
221
  normal_images = render_video(mesh_model)["normal"]
222
  video_path = os.path.join(output_root, "gs_mesh.mp4")
223
  merge_images_video(color_images, normal_images, video_path)
224
 
  mesh = trimesh.Trimesh(
226
  vertices=mesh_model.vertices.cpu().numpy(),
227
  faces=mesh_model.faces.cpu().numpy(),
 
261
  min_mass, max_mass = map(float, args.mass_range.split("-"))
262
  asset_attrs["min_mass"] = min_mass
263
  asset_attrs["max_mass"] = max_mass
264
+ if isinstance(args.asset_type, list) and args.asset_type[idx]:
265
+ asset_attrs["category"] = args.asset_type[idx]
266
  if args.version:
267
  asset_attrs["version"] = args.version
268
 
 
301
  ]
302
  images_list.append(images)
303
 
304
+ qa_results = BaseChecker.validate(CHECKERS, images_list)
305
+ urdf_convertor.add_quality_tag(urdf_path, qa_results)
306
 
307
  # Organize the final result files
308
  result_dir = f"{output_root}/result"
 
315
  f"{result_dir}/{urdf_convertor.output_mesh_dir}",
316
  )
317
  copy(video_path, f"{result_dir}/video.mp4")
318
+ if not args.keep_intermediate:
319
  delete_dir(output_root, keep_subs=["result"])
320
 
321
  except Exception as e:
 
323
  continue
324
 
325
  logger.info(f"Processing complete. Outputs saved to {args.output_root}")
326
+
327
+
328
+ if __name__ == "__main__":
329
+ entrypoint()
embodied_gen/scripts/text2image.py CHANGED
@@ -31,6 +31,7 @@ from embodied_gen.models.text_model import (
     build_text2img_pipeline,
     text2img_gen,
 )
+from embodied_gen.utils.process_media import parse_text_prompts

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -85,7 +86,7 @@
     parser.add_argument(
         "--seed",
         type=int,
-        default=0,
+        default=None,
     )
     args = parser.parse_args()

@@ -101,14 +102,7 @@ def entrypoint(
         if hasattr(args, k) and v is not None:
             setattr(args, k, v)

-    prompts = args.prompts
-    if len(prompts) == 1 and prompts[0].endswith(".txt"):
-        with open(prompts[0], "r") as f:
-            prompts = f.readlines()
-        prompts = [
-            prompt.strip() for prompt in prompts if prompt.strip() != ""
-        ]
-
+    prompts = parse_text_prompts(args.prompts)
     os.makedirs(args.output_root, exist_ok=True)

     ip_img_paths = args.ref_image
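The removed inline parsing is now delegated to `parse_text_prompts`; judging from the code it replaces, a roughly equivalent helper would look like the sketch below (the real helper lives in embodied_gen.utils.process_media and its exact behavior may differ):

# Hypothetical stand-in mirroring the removed inline logic (for illustration only).
def parse_text_prompts_sketch(prompts: list[str]) -> list[str]:
    # A single ".txt" argument is treated as a file with one prompt per line.
    if len(prompts) == 1 and prompts[0].endswith(".txt"):
        with open(prompts[0], "r") as f:
            prompts = [line.strip() for line in f if line.strip()]
    return prompts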
embodied_gen/scripts/textto3d.py ADDED
@@ -0,0 +1,280 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+ import argparse
18
+ import os
19
+ import random
20
+ from collections import defaultdict
21
+
22
+ import numpy as np
23
+ import torch
24
+ from PIL import Image
25
+ from embodied_gen.models.image_comm_model import build_hf_image_pipeline
26
+ from embodied_gen.models.segment_model import RembgRemover
27
+ from embodied_gen.models.text_model import PROMPT_APPEND
28
+ from embodied_gen.scripts.imageto3d import entrypoint as imageto3d_api
29
+ from embodied_gen.utils.gpt_clients import GPT_CLIENT
30
+ from embodied_gen.utils.log import logger
31
+ from embodied_gen.utils.process_media import (
32
+ check_object_edge_truncated,
33
+ render_asset3d,
34
+ )
35
+ from embodied_gen.validators.quality_checkers import (
36
+ ImageSegChecker,
37
+ SemanticConsistChecker,
38
+ TextGenAlignChecker,
39
+ )
40
+
41
+ # Avoid huggingface/tokenizers: The current process just got forked.
42
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
43
+ random.seed(0)
44
+
45
+ logger.info("Loading Models...")
46
+ SEMANTIC_CHECKER = SemanticConsistChecker(GPT_CLIENT)
47
+ SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
48
+ TXTGEN_CHECKER = TextGenAlignChecker(GPT_CLIENT)
49
+ PIPE_IMG = build_hf_image_pipeline(os.environ.get("TEXT_MODEL", "sd35"))
50
+ BG_REMOVER = RembgRemover()
51
+
52
+
53
+ __all__ = [
54
+ "text_to_image",
55
+ "text_to_3d",
56
+ ]
57
+
58
+
59
+ def text_to_image(
60
+ prompt: str,
61
+ save_path: str,
62
+ n_retry: int,
63
+ img_denoise_step: int,
64
+ text_guidance_scale: float,
65
+ n_img_sample: int,
66
+ image_hw: tuple[int, int] = (1024, 1024),
67
+ seed: int = None,
68
+ ) -> bool:
69
+ select_image = None
70
+ success_flag = False
71
+ assert save_path.endswith(".png"), "Image save path must end with `.png`."
72
+ for try_idx in range(n_retry):
73
+ if select_image is not None:
74
+ select_image[0].save(save_path.replace(".png", "_raw.png"))
75
+ select_image[1].save(save_path)
76
+ break
77
+
78
+ f_prompt = PROMPT_APPEND.format(object=prompt)
79
+ logger.info(
80
+ f"Image GEN for {os.path.basename(save_path)}\n"
81
+ f"Try: {try_idx + 1}/{n_retry}, Seed: {seed}, Prompt: {f_prompt}"
82
+ )
83
+ torch.cuda.empty_cache()
84
+ images = PIPE_IMG.run(
85
+ f_prompt,
86
+ num_inference_steps=img_denoise_step,
87
+ guidance_scale=text_guidance_scale,
88
+ num_images_per_prompt=n_img_sample,
89
+ height=image_hw[0],
90
+ width=image_hw[1],
91
+ generator=(
92
+ torch.Generator().manual_seed(seed)
93
+ if seed is not None
94
+ else None
95
+ ),
96
+ )
97
+
98
+ for idx in range(len(images)):
99
+ raw_image: Image.Image = images[idx]
100
+ image = BG_REMOVER(raw_image)
101
+ image.save(save_path)
102
+ semantic_flag, semantic_result = SEMANTIC_CHECKER(
103
+ prompt, [image.convert("RGB")]
104
+ )
105
+ seg_flag, seg_result = SEG_CHECKER(
106
+ [raw_image, image.convert("RGB")]
107
+ )
108
+ image_mask = np.array(image)[..., -1]
109
+ edge_flag = check_object_edge_truncated(image_mask)
110
+ logger.warning(
111
+ f"SEMANTIC: {semantic_result}. SEG: {seg_result}. EDGE: {edge_flag}"
112
+ )
113
+ if (
114
+ (edge_flag and semantic_flag and seg_flag)
115
+ or (edge_flag and semantic_flag is None)
116
+ or (edge_flag and seg_flag is None)
117
+ ):
118
+ select_image = [raw_image, image]
119
+ success_flag = True
120
+ break
121
+
122
+ seed = random.randint(0, 100000) if seed is not None else None
123
+
124
+ return success_flag
125
+
126
+
127
+ def text_to_3d(**kwargs) -> dict:
128
+ args = parse_args()
129
+ for k, v in kwargs.items():
130
+ if hasattr(args, k) and v is not None:
131
+ setattr(args, k, v)
132
+
133
+ if args.asset_names is None or len(args.asset_names) == 0:
134
+ args.asset_names = [f"sample3d_{i}" for i in range(len(args.prompts))]
135
+ img_save_dir = os.path.join(args.output_root, "images")
136
+ asset_save_dir = os.path.join(args.output_root, "asset3d")
137
+ os.makedirs(img_save_dir, exist_ok=True)
138
+ os.makedirs(asset_save_dir, exist_ok=True)
139
+ results = defaultdict(dict)
140
+ for prompt, node in zip(args.prompts, args.asset_names):
141
+ success_flag = False
142
+ n_pipe_retry = args.n_pipe_retry
143
+ seed_img = args.seed_img
144
+ seed_3d = args.seed_3d
145
+ while success_flag is False and n_pipe_retry > 0:
146
+ logger.info(
147
+ f"GEN pipeline for node {node}\n"
148
+ f"Try round: {args.n_pipe_retry-n_pipe_retry+1}/{args.n_pipe_retry}, Prompt: {prompt}"
149
+ )
150
+ # Text-to-image GEN
151
+ save_node = node.replace(" ", "_")
152
+ gen_image_path = f"{img_save_dir}/{save_node}.png"
153
+ textgen_flag = text_to_image(
154
+ prompt,
155
+ gen_image_path,
156
+ args.n_image_retry,
157
+ args.img_denoise_step,
158
+ args.text_guidance_scale,
159
+ args.n_img_sample,
160
+ seed=seed_img,
161
+ )
162
+
163
+ # Asset 3D GEN
164
+ node_save_dir = f"{asset_save_dir}/{save_node}"
165
+ asset_type = node if "sample3d_" not in node else None
166
+ imageto3d_api(
167
+ image_path=[gen_image_path],
168
+ output_root=node_save_dir,
169
+ asset_type=[asset_type],
170
+ seed=random.randint(0, 100000) if seed_3d is None else seed_3d,
171
+ n_retry=args.n_asset_retry,
172
+ keep_intermediate=args.keep_intermediate,
173
+ )
174
+ mesh_path = f"{node_save_dir}/result/mesh/{save_node}.obj"
175
+ image_path = render_asset3d(
176
+ mesh_path,
177
+ output_root=f"{node_save_dir}/result",
178
+ num_images=6,
179
+ elevation=(30, -30),
180
+ output_subdir="renders",
181
+ no_index_file=True,
182
+ )
183
+
184
+ check_text = asset_type if asset_type is not None else prompt
185
+ qa_flag, qa_result = TXTGEN_CHECKER(check_text, image_path)
186
+ logger.warning(
187
+ f"Node {node}, {TXTGEN_CHECKER.__class__.__name__}: {qa_result}"
188
+ )
189
+ results["assets"][node] = f"{node_save_dir}/result"
190
+ results["quality"][node] = qa_result
191
+
192
+ if qa_flag is None or qa_flag is True:
193
+ success_flag = True
194
+ break
195
+
196
+ n_pipe_retry -= 1
197
+ seed_img = (
198
+ random.randint(0, 100000) if seed_img is not None else None
199
+ )
200
+ seed_3d = (
201
+ random.randint(0, 100000) if seed_3d is not None else None
202
+ )
203
+
204
+ torch.cuda.empty_cache()
205
+
206
+ return results
207
+
208
+
209
+ def parse_args():
210
+ parser = argparse.ArgumentParser(description="3D Layout Generation Config")
211
+ parser.add_argument("--prompts", nargs="+", help="text descriptions")
212
+ parser.add_argument(
213
+ "--output_root",
214
+ type=str,
215
+ help="Directory to save outputs",
216
+ )
217
+ parser.add_argument(
218
+ "--asset_names",
219
+ type=str,
220
+ nargs="+",
221
+ default=None,
222
+ help="Asset names to generate",
223
+ )
224
+ parser.add_argument(
225
+ "--n_img_sample",
226
+ type=int,
227
+ default=3,
228
+ help="Number of image samples to generate",
229
+ )
230
+ parser.add_argument(
231
+ "--text_guidance_scale",
232
+ type=float,
233
+ default=7,
234
+ help="Text-to-image guidance scale",
235
+ )
236
+ parser.add_argument(
237
+ "--img_denoise_step",
238
+ type=int,
239
+ default=25,
240
+ help="Denoising steps for image generation",
241
+ )
242
+ parser.add_argument(
243
+ "--n_image_retry",
244
+ type=int,
245
+ default=2,
246
+ help="Max retry count for image generation",
247
+ )
248
+ parser.add_argument(
249
+ "--n_asset_retry",
250
+ type=int,
251
+ default=2,
252
+ help="Max retry count for 3D generation",
253
+ )
254
+ parser.add_argument(
255
+ "--n_pipe_retry",
256
+ type=int,
257
+ default=1,
258
+ help="Max retry count for 3D asset generation",
259
+ )
260
+ parser.add_argument(
261
+ "--seed_img",
262
+ type=int,
263
+ default=None,
264
+ help="Random seed for image generation",
265
+ )
266
+ parser.add_argument(
267
+ "--seed_3d",
268
+ type=int,
269
+ default=0,
270
+ help="Random seed for 3D generation",
271
+ )
272
+ parser.add_argument("--keep_intermediate", action="store_true")
273
+
274
+ args, unknown = parser.parse_known_args()
275
+
276
+ return args
277
+
278
+
279
+ if __name__ == "__main__":
280
+ text_to_3d()
embodied_gen/scripts/textto3d.sh CHANGED
@@ -2,7 +2,9 @@
2
 
3
  # Initialize variables
4
  prompts=()
 
5
  output_root=""
 
6
 
7
  # Parse arguments
8
  while [[ $# -gt 0 ]]; do
@@ -14,10 +16,21 @@ while [[ $# -gt 0 ]]; do
14
  shift
15
  done
16
  ;;
 
 
 
 
 
 
 
17
  --output_root)
18
  output_root="$2"
19
  shift 2
20
  ;;
 
 
 
 
21
  *)
22
  echo "Unknown argument: $1"
23
  exit 1
@@ -28,7 +41,21 @@ done
28
  # Validate required arguments
29
  if [[ ${#prompts[@]} -eq 0 || -z "$output_root" ]]; then
30
  echo "Missing required arguments."
31
- echo "Usage: bash run_text2asset3d.sh --prompts \"Prompt1\" \"Prompt2\" --output_root <path>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  exit 1
33
  fi
34
 
@@ -37,20 +64,30 @@ echo "Prompts:"
37
  for p in "${prompts[@]}"; do
38
  echo " - $p"
39
  done
 
 
 
 
40
  echo "Output root: ${output_root}"
 
41
 
42
- # Concatenate prompts for Python command
43
  prompt_args=""
44
- for p in "${prompts[@]}"; do
45
- prompt_args+="\"$p\" "
 
 
46
  done
47
 
 
48
  # Step 1: Text-to-Image
49
  eval python3 embodied_gen/scripts/text2image.py \
50
  --prompts ${prompt_args} \
51
- --output_root "${output_root}/images"
 
52
 
53
  # Step 2: Image-to-3D
54
  python3 embodied_gen/scripts/imageto3d.py \
55
  --image_root "${output_root}/images" \
56
- --output_root "${output_root}/asset3d"
 
 
2
 
3
  # Initialize variables
4
  prompts=()
5
+ asset_types=()
6
  output_root=""
7
+ seed=0
8
 
9
  # Parse arguments
10
  while [[ $# -gt 0 ]]; do
 
16
  shift
17
  done
18
  ;;
19
+ --asset_types)
20
+ shift
21
+ while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
22
+ asset_types+=("$1")
23
+ shift
24
+ done
25
+ ;;
26
  --output_root)
27
  output_root="$2"
28
  shift 2
29
  ;;
30
+ --seed)
31
+ seed="$2"
32
+ shift 2
33
+ ;;
34
  *)
35
  echo "Unknown argument: $1"
36
  exit 1
 
41
  # Validate required arguments
42
  if [[ ${#prompts[@]} -eq 0 || -z "$output_root" ]]; then
43
  echo "Missing required arguments."
44
+ echo "Usage: bash run_text2asset3d.sh --prompts \"Prompt1\" \"Prompt2\" \
45
+ --asset_types \"type1\" \"type2\" --seed <seed_value> --output_root <path>"
46
+ exit 1
47
+ fi
48
+
49
+ # If no asset_types provided, default to ""
50
+ if [[ ${#asset_types[@]} -eq 0 ]]; then
51
+ for (( i=0; i<${#prompts[@]}; i++ )); do
52
+ asset_types+=("")
53
+ done
54
+ fi
55
+
56
+ # Ensure the number of asset_types matches the number of prompts
57
+ if [[ ${#prompts[@]} -ne ${#asset_types[@]} ]]; then
58
+ echo "The number of asset types must match the number of prompts."
59
  exit 1
60
  fi
61
 
 
64
  for p in "${prompts[@]}"; do
65
  echo " - $p"
66
  done
67
+ # echo "Asset types:"
68
+ # for at in "${asset_types[@]}"; do
69
+ # echo " - $at"
70
+ # done
71
  echo "Output root: ${output_root}"
72
+ echo "Seed: ${seed}"
73
 
74
+ # Concatenate prompts and asset types for Python command
75
  prompt_args=""
76
+ asset_type_args=""
77
+ for i in "${!prompts[@]}"; do
78
+ prompt_args+="\"${prompts[$i]}\" "
79
+ asset_type_args+="\"${asset_types[$i]}\" "
80
  done
81
 
82
+
83
  # Step 1: Text-to-Image
84
  eval python3 embodied_gen/scripts/text2image.py \
85
  --prompts ${prompt_args} \
86
+ --output_root "${output_root}/images" \
87
+ --seed ${seed}
88
 
89
  # Step 2: Image-to-3D
90
  python3 embodied_gen/scripts/imageto3d.py \
91
  --image_root "${output_root}/images" \
92
+ --output_root "${output_root}/asset3d" \
93
+ --asset_type ${asset_type_args}
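For readers driving the two stages from Python instead of the shell wrapper, here is a rough, hypothetical equivalent of the updated script (prompts, paths, and asset types are illustrative; the flag names are taken verbatim from the calls above):

```python
import subprocess

# Illustrative inputs; an empty asset type mirrors the script's default of "".
prompts = ["A wooden mug", "A blue plastic bowl"]
asset_types = ["", ""]
output_root = "outputs/demo"
seed = 0

# Step 1: Text-to-Image (same flags as the text2image.py call above).
subprocess.run(
    ["python3", "embodied_gen/scripts/text2image.py",
     "--prompts", *prompts,
     "--output_root", f"{output_root}/images",
     "--seed", str(seed)],
    check=True,
)

# Step 2: Image-to-3D, consuming the images written in step 1.
subprocess.run(
    ["python3", "embodied_gen/scripts/imageto3d.py",
     "--image_root", f"{output_root}/images",
     "--output_root", f"{output_root}/asset3d",
     "--asset_type", *asset_types],
    check=True,
)
```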
embodied_gen/scripts/texture_gen.sh CHANGED
@@ -10,10 +10,6 @@ while [[ $# -gt 0 ]]; do
10
  prompt="$2"
11
  shift 2
12
  ;;
13
- --uuid)
14
- uuid="$2"
15
- shift 2
16
- ;;
17
  --output_root)
18
  output_root="$2"
19
  shift 2
@@ -26,12 +22,13 @@ while [[ $# -gt 0 ]]; do
26
  done
27
 
28
 
29
- if [[ -z "$mesh_path" || -z "$prompt" || -z "$uuid" || -z "$output_root" ]]; then
30
  echo "params missing"
31
- echo "usage: bash run.sh --mesh_path <path> --prompt <text> --uuid <id> --output_root <path>"
32
  exit 1
33
  fi
34
 
 
35
  # Step 1: drender-cli for condition rendering
36
  drender-cli --mesh_path ${mesh_path} \
37
  --output_root ${output_root}/condition \
 
10
  prompt="$2"
11
  shift 2
12
  ;;
 
 
 
 
13
  --output_root)
14
  output_root="$2"
15
  shift 2
 
22
  done
23
 
24
 
25
+ if [[ -z "$mesh_path" || -z "$prompt" || -z "$output_root" ]]; then
26
  echo "params missing"
27
+ echo "usage: bash run.sh --mesh_path <path> --prompt <text> --output_root <path>"
28
  exit 1
29
  fi
30
 
31
+ uuid=$(basename "$output_root")
32
  # Step 1: drender-cli for condition rendering
33
  drender-cli --mesh_path ${mesh_path} \
34
  --output_root ${output_root}/condition \
embodied_gen/trainer/gsplat_trainer.py ADDED
@@ -0,0 +1,678 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+ # Part of the code comes from https://github.com/nerfstudio-project/gsplat/blob/main/examples/simple_trainer.py
17
+ # Both under the Apache License, Version 2.0.
18
+
19
+
20
+ import json
21
+ import os
22
+ import time
23
+ from collections import defaultdict
24
+ from typing import Dict, Optional, Tuple
25
+
26
+ import cv2
27
+ import imageio
28
+ import numpy as np
29
+ import torch
30
+ import torch.nn.functional as F
31
+ import tqdm
32
+ import tyro
33
+ import yaml
34
+ from fused_ssim import fused_ssim
35
+ from gsplat.distributed import cli
36
+ from gsplat.rendering import rasterization
37
+ from gsplat.strategy import DefaultStrategy, MCMCStrategy
38
+ from torch import Tensor
39
+ from torch.utils.tensorboard import SummaryWriter
40
+ from torchmetrics.image import (
41
+ PeakSignalNoiseRatio,
42
+ StructuralSimilarityIndexMeasure,
43
+ )
44
+ from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
45
+ from typing_extensions import Literal, assert_never
46
+ from embodied_gen.data.datasets import PanoGSplatDataset
47
+ from embodied_gen.utils.config import GsplatTrainConfig
48
+ from embodied_gen.utils.gaussian import (
49
+ create_splats_with_optimizers,
50
+ export_splats,
51
+ resize_pinhole_intrinsics,
52
+ set_random_seed,
53
+ )
54
+
55
+
56
+ class Runner:
57
+ """Engine for training and testing from gsplat example.
58
+
59
+ Code from https://github.com/nerfstudio-project/gsplat/blob/main/examples/simple_trainer.py
60
+ """
61
+
62
+ def __init__(
63
+ self,
64
+ local_rank: int,
65
+ world_rank,
66
+ world_size: int,
67
+ cfg: GsplatTrainConfig,
68
+ ) -> None:
69
+ set_random_seed(42 + local_rank)
70
+
71
+ self.cfg = cfg
72
+ self.world_rank = world_rank
73
+ self.local_rank = local_rank
74
+ self.world_size = world_size
75
+ self.device = f"cuda:{local_rank}"
76
+
77
+ # Where to dump results.
78
+ os.makedirs(cfg.result_dir, exist_ok=True)
79
+
80
+ # Setup output directories.
81
+ self.ckpt_dir = f"{cfg.result_dir}/ckpts"
82
+ os.makedirs(self.ckpt_dir, exist_ok=True)
83
+ self.stats_dir = f"{cfg.result_dir}/stats"
84
+ os.makedirs(self.stats_dir, exist_ok=True)
85
+ self.render_dir = f"{cfg.result_dir}/renders"
86
+ os.makedirs(self.render_dir, exist_ok=True)
87
+ self.ply_dir = f"{cfg.result_dir}/ply"
88
+ os.makedirs(self.ply_dir, exist_ok=True)
89
+
90
+ # Tensorboard
91
+ self.writer = SummaryWriter(log_dir=f"{cfg.result_dir}/tb")
92
+ self.trainset = PanoGSplatDataset(cfg.data_dir, split="train")
93
+ self.valset = PanoGSplatDataset(
94
+ cfg.data_dir, split="train", max_sample_num=6
95
+ )
96
+ self.testset = PanoGSplatDataset(cfg.data_dir, split="eval")
97
+ self.scene_scale = cfg.scene_scale
98
+
99
+ # Model
100
+ self.splats, self.optimizers = create_splats_with_optimizers(
101
+ self.trainset.points,
102
+ self.trainset.points_rgb,
103
+ init_num_pts=cfg.init_num_pts,
104
+ init_extent=cfg.init_extent,
105
+ init_opacity=cfg.init_opa,
106
+ init_scale=cfg.init_scale,
107
+ means_lr=cfg.means_lr,
108
+ scales_lr=cfg.scales_lr,
109
+ opacities_lr=cfg.opacities_lr,
110
+ quats_lr=cfg.quats_lr,
111
+ sh0_lr=cfg.sh0_lr,
112
+ shN_lr=cfg.shN_lr,
113
+ scene_scale=self.scene_scale,
114
+ sh_degree=cfg.sh_degree,
115
+ sparse_grad=cfg.sparse_grad,
116
+ visible_adam=cfg.visible_adam,
117
+ batch_size=cfg.batch_size,
118
+ feature_dim=None,
119
+ device=self.device,
120
+ world_rank=world_rank,
121
+ world_size=world_size,
122
+ )
123
+ print("Model initialized. Number of GS:", len(self.splats["means"]))
124
+
125
+ # Densification Strategy
126
+ self.cfg.strategy.check_sanity(self.splats, self.optimizers)
127
+
128
+ if isinstance(self.cfg.strategy, DefaultStrategy):
129
+ self.strategy_state = self.cfg.strategy.initialize_state(
130
+ scene_scale=self.scene_scale
131
+ )
132
+ elif isinstance(self.cfg.strategy, MCMCStrategy):
133
+ self.strategy_state = self.cfg.strategy.initialize_state()
134
+ else:
135
+ assert_never(self.cfg.strategy)
136
+
137
+ # Losses & Metrics.
138
+ self.ssim = StructuralSimilarityIndexMeasure(data_range=1.0).to(
139
+ self.device
140
+ )
141
+ self.psnr = PeakSignalNoiseRatio(data_range=1.0).to(self.device)
142
+
143
+ if cfg.lpips_net == "alex":
144
+ self.lpips = LearnedPerceptualImagePatchSimilarity(
145
+ net_type="alex", normalize=True
146
+ ).to(self.device)
147
+ elif cfg.lpips_net == "vgg":
148
+ # The 3DGS official repo uses lpips vgg, which is equivalent to the following:
149
+ self.lpips = LearnedPerceptualImagePatchSimilarity(
150
+ net_type="vgg", normalize=False
151
+ ).to(self.device)
152
+ else:
153
+ raise ValueError(f"Unknown LPIPS network: {cfg.lpips_net}")
154
+
155
+ def rasterize_splats(
156
+ self,
157
+ camtoworlds: Tensor,
158
+ Ks: Tensor,
159
+ width: int,
160
+ height: int,
161
+ masks: Optional[Tensor] = None,
162
+ rasterize_mode: Optional[Literal["classic", "antialiased"]] = None,
163
+ camera_model: Optional[Literal["pinhole", "ortho", "fisheye"]] = None,
164
+ **kwargs,
165
+ ) -> Tuple[Tensor, Tensor, Dict]:
166
+ means = self.splats["means"] # [N, 3]
167
+ # quats = F.normalize(self.splats["quats"], dim=-1) # [N, 4]
168
+ # rasterization does normalization internally
169
+ quats = self.splats["quats"] # [N, 4]
170
+ scales = torch.exp(self.splats["scales"]) # [N, 3]
171
+ opacities = torch.sigmoid(self.splats["opacities"]) # [N,]
172
+ image_ids = kwargs.pop("image_ids", None)
173
+
174
+ colors = torch.cat(
175
+ [self.splats["sh0"], self.splats["shN"]], 1
176
+ ) # [N, K, 3]
177
+
178
+ if rasterize_mode is None:
179
+ rasterize_mode = (
180
+ "antialiased" if self.cfg.antialiased else "classic"
181
+ )
182
+ if camera_model is None:
183
+ camera_model = self.cfg.camera_model
184
+
185
+ render_colors, render_alphas, info = rasterization(
186
+ means=means,
187
+ quats=quats,
188
+ scales=scales,
189
+ opacities=opacities,
190
+ colors=colors,
191
+ viewmats=torch.linalg.inv(camtoworlds), # [C, 4, 4]
192
+ Ks=Ks, # [C, 3, 3]
193
+ width=width,
194
+ height=height,
195
+ packed=self.cfg.packed,
196
+ absgrad=(
197
+ self.cfg.strategy.absgrad
198
+ if isinstance(self.cfg.strategy, DefaultStrategy)
199
+ else False
200
+ ),
201
+ sparse_grad=self.cfg.sparse_grad,
202
+ rasterize_mode=rasterize_mode,
203
+ distributed=self.world_size > 1,
204
+ camera_model=self.cfg.camera_model,
205
+ with_ut=self.cfg.with_ut,
206
+ with_eval3d=self.cfg.with_eval3d,
207
+ **kwargs,
208
+ )
209
+ if masks is not None:
210
+ render_colors[~masks] = 0
211
+ return render_colors, render_alphas, info
212
+
213
+ def train(self):
214
+ cfg = self.cfg
215
+ device = self.device
216
+ world_rank = self.world_rank
217
+
218
+ # Dump cfg.
219
+ if world_rank == 0:
220
+ with open(f"{cfg.result_dir}/cfg.yml", "w") as f:
221
+ yaml.dump(vars(cfg), f)
222
+
223
+ max_steps = cfg.max_steps
224
+ init_step = 0
225
+
226
+ schedulers = [
227
+ # means has a learning rate schedule that ends at 0.01 of the initial value
228
+ torch.optim.lr_scheduler.ExponentialLR(
229
+ self.optimizers["means"], gamma=0.01 ** (1.0 / max_steps)
230
+ ),
231
+ ]
232
+ trainloader = torch.utils.data.DataLoader(
233
+ self.trainset,
234
+ batch_size=cfg.batch_size,
235
+ shuffle=True,
236
+ num_workers=4,
237
+ persistent_workers=True,
238
+ pin_memory=True,
239
+ )
240
+ trainloader_iter = iter(trainloader)
241
+
242
+ # Training loop.
243
+ global_tic = time.time()
244
+ pbar = tqdm.tqdm(range(init_step, max_steps))
245
+ for step in pbar:
246
+ try:
247
+ data = next(trainloader_iter)
248
+ except StopIteration:
249
+ trainloader_iter = iter(trainloader)
250
+ data = next(trainloader_iter)
251
+
252
+ camtoworlds = data["camtoworld"].to(device) # [1, 4, 4]
253
+ Ks = data["K"].to(device) # [1, 3, 3]
254
+ pixels = data["image"].to(device) / 255.0 # [1, H, W, 3]
255
+ image_ids = data["image_id"].to(device)
256
+ masks = (
257
+ data["mask"].to(device) if "mask" in data else None
258
+ ) # [1, H, W]
259
+ if cfg.depth_loss:
260
+ points = data["points"].to(device) # [1, M, 2]
261
+ depths_gt = data["depths"].to(device) # [1, M]
262
+
263
+ height, width = pixels.shape[1:3]
264
+
265
+ # sh schedule
266
+ sh_degree_to_use = min(
267
+ step // cfg.sh_degree_interval, cfg.sh_degree
268
+ )
269
+
270
+ # forward
271
+ renders, alphas, info = self.rasterize_splats(
272
+ camtoworlds=camtoworlds,
273
+ Ks=Ks,
274
+ width=width,
275
+ height=height,
276
+ sh_degree=sh_degree_to_use,
277
+ near_plane=cfg.near_plane,
278
+ far_plane=cfg.far_plane,
279
+ image_ids=image_ids,
280
+ render_mode="RGB+ED" if cfg.depth_loss else "RGB",
281
+ masks=masks,
282
+ )
283
+ if renders.shape[-1] == 4:
284
+ colors, depths = renders[..., 0:3], renders[..., 3:4]
285
+ else:
286
+ colors, depths = renders, None
287
+
288
+ if cfg.random_bkgd:
289
+ bkgd = torch.rand(1, 3, device=device)
290
+ colors = colors + bkgd * (1.0 - alphas)
291
+
292
+ self.cfg.strategy.step_pre_backward(
293
+ params=self.splats,
294
+ optimizers=self.optimizers,
295
+ state=self.strategy_state,
296
+ step=step,
297
+ info=info,
298
+ )
299
+
300
+ # loss
301
+ l1loss = F.l1_loss(colors, pixels)
302
+ ssimloss = 1.0 - fused_ssim(
303
+ colors.permute(0, 3, 1, 2),
304
+ pixels.permute(0, 3, 1, 2),
305
+ padding="valid",
306
+ )
307
+ loss = (
308
+ l1loss * (1.0 - cfg.ssim_lambda) + ssimloss * cfg.ssim_lambda
309
+ )
310
+ if cfg.depth_loss:
311
+ # query depths from depth map
312
+ points = torch.stack(
313
+ [
314
+ points[:, :, 0] / (width - 1) * 2 - 1,
315
+ points[:, :, 1] / (height - 1) * 2 - 1,
316
+ ],
317
+ dim=-1,
318
+ ) # normalize to [-1, 1]
319
+ grid = points.unsqueeze(2) # [1, M, 1, 2]
320
+ depths = F.grid_sample(
321
+ depths.permute(0, 3, 1, 2), grid, align_corners=True
322
+ ) # [1, 1, M, 1]
323
+ depths = depths.squeeze(3).squeeze(1) # [1, M]
324
+ # calculate loss in disparity space
325
+ disp = torch.where(
326
+ depths > 0.0, 1.0 / depths, torch.zeros_like(depths)
327
+ )
328
+ disp_gt = 1.0 / depths_gt # [1, M]
329
+ depthloss = F.l1_loss(disp, disp_gt) * self.scene_scale
330
+ loss += depthloss * cfg.depth_lambda
331
+
332
+ # regularizations
333
+ if cfg.opacity_reg > 0.0:
334
+ loss += (
335
+ cfg.opacity_reg
336
+ * torch.sigmoid(self.splats["opacities"]).mean()
337
+ )
338
+ if cfg.scale_reg > 0.0:
339
+ loss += cfg.scale_reg * torch.exp(self.splats["scales"]).mean()
340
+
341
+ loss.backward()
342
+
343
+ desc = (
344
+ f"loss={loss.item():.3f}| " f"sh degree={sh_degree_to_use}| "
345
+ )
346
+ if cfg.depth_loss:
347
+ desc += f"depth loss={depthloss.item():.6f}| "
348
+ pbar.set_description(desc)
349
+
350
+ # write images (gt and render)
351
+ # if world_rank == 0 and step % 800 == 0:
352
+ # canvas = torch.cat([pixels, colors], dim=2).detach().cpu().numpy()
353
+ # canvas = canvas.reshape(-1, *canvas.shape[2:])
354
+ # imageio.imwrite(
355
+ # f"{self.render_dir}/train_rank{self.world_rank}.png",
356
+ # (canvas * 255).astype(np.uint8),
357
+ # )
358
+
359
+ if (
360
+ world_rank == 0
361
+ and cfg.tb_every > 0
362
+ and step % cfg.tb_every == 0
363
+ ):
364
+ mem = torch.cuda.max_memory_allocated() / 1024**3
365
+ self.writer.add_scalar("train/loss", loss.item(), step)
366
+ self.writer.add_scalar("train/l1loss", l1loss.item(), step)
367
+ self.writer.add_scalar("train/ssimloss", ssimloss.item(), step)
368
+ self.writer.add_scalar(
369
+ "train/num_GS", len(self.splats["means"]), step
370
+ )
371
+ self.writer.add_scalar("train/mem", mem, step)
372
+ if cfg.depth_loss:
373
+ self.writer.add_scalar(
374
+ "train/depthloss", depthloss.item(), step
375
+ )
376
+ if cfg.tb_save_image:
377
+ canvas = (
378
+ torch.cat([pixels, colors], dim=2)
379
+ .detach()
380
+ .cpu()
381
+ .numpy()
382
+ )
383
+ canvas = canvas.reshape(-1, *canvas.shape[2:])
384
+ self.writer.add_image("train/render", canvas, step)
385
+ self.writer.flush()
386
+
387
+ # save checkpoint before updating the model
388
+ if (
389
+ step in [i - 1 for i in cfg.save_steps]
390
+ or step == max_steps - 1
391
+ ):
392
+ mem = torch.cuda.max_memory_allocated() / 1024**3
393
+ stats = {
394
+ "mem": mem,
395
+ "ellipse_time": time.time() - global_tic,
396
+ "num_GS": len(self.splats["means"]),
397
+ }
398
+ print("Step: ", step, stats)
399
+ with open(
400
+ f"{self.stats_dir}/train_step{step:04d}_rank{self.world_rank}.json",
401
+ "w",
402
+ ) as f:
403
+ json.dump(stats, f)
404
+ data = {"step": step, "splats": self.splats.state_dict()}
405
+ torch.save(
406
+ data,
407
+ f"{self.ckpt_dir}/ckpt_{step}_rank{self.world_rank}.pt",
408
+ )
409
+ if (
410
+ step in [i - 1 for i in cfg.ply_steps] or step == max_steps - 1
411
+ ) and cfg.save_ply:
412
+ sh0 = self.splats["sh0"]
413
+ shN = self.splats["shN"]
414
+ means = self.splats["means"]
415
+ scales = self.splats["scales"]
416
+ quats = self.splats["quats"]
417
+ opacities = self.splats["opacities"]
418
+ export_splats(
419
+ means=means,
420
+ scales=scales,
421
+ quats=quats,
422
+ opacities=opacities,
423
+ sh0=sh0,
424
+ shN=shN,
425
+ format="ply",
426
+ save_to=f"{self.ply_dir}/point_cloud_{step}.ply",
427
+ )
428
+
429
+ # Turn Gradients into Sparse Tensor before running optimizer
430
+ if cfg.sparse_grad:
431
+ assert (
432
+ cfg.packed
433
+ ), "Sparse gradients only work with packed mode."
434
+ gaussian_ids = info["gaussian_ids"]
435
+ for k in self.splats.keys():
436
+ grad = self.splats[k].grad
437
+ if grad is None or grad.is_sparse:
438
+ continue
439
+ self.splats[k].grad = torch.sparse_coo_tensor(
440
+ indices=gaussian_ids[None], # [1, nnz]
441
+ values=grad[gaussian_ids], # [nnz, ...]
442
+ size=self.splats[k].size(), # [N, ...]
443
+ is_coalesced=len(Ks) == 1,
444
+ )
445
+
446
+ if cfg.visible_adam:
447
+ gaussian_cnt = self.splats.means.shape[0]
448
+ if cfg.packed:
449
+ visibility_mask = torch.zeros_like(
450
+ self.splats["opacities"], dtype=bool
451
+ )
452
+ visibility_mask.scatter_(0, info["gaussian_ids"], 1)
453
+ else:
454
+ visibility_mask = (info["radii"] > 0).all(-1).any(0)
455
+
456
+ # optimize
457
+ for optimizer in self.optimizers.values():
458
+ if cfg.visible_adam:
459
+ optimizer.step(visibility_mask)
460
+ else:
461
+ optimizer.step()
462
+ optimizer.zero_grad(set_to_none=True)
463
+ for scheduler in schedulers:
464
+ scheduler.step()
465
+
466
+ # Run post-backward steps after backward and optimizer
467
+ if isinstance(self.cfg.strategy, DefaultStrategy):
468
+ self.cfg.strategy.step_post_backward(
469
+ params=self.splats,
470
+ optimizers=self.optimizers,
471
+ state=self.strategy_state,
472
+ step=step,
473
+ info=info,
474
+ packed=cfg.packed,
475
+ )
476
+ elif isinstance(self.cfg.strategy, MCMCStrategy):
477
+ self.cfg.strategy.step_post_backward(
478
+ params=self.splats,
479
+ optimizers=self.optimizers,
480
+ state=self.strategy_state,
481
+ step=step,
482
+ info=info,
483
+ lr=schedulers[0].get_last_lr()[0],
484
+ )
485
+ else:
486
+ assert_never(self.cfg.strategy)
487
+
488
+ # eval the full set
489
+ if step in [i - 1 for i in cfg.eval_steps]:
490
+ self.eval(step)
491
+ self.render_video(step)
492
+
493
+ @torch.no_grad()
494
+ def eval(
495
+ self,
496
+ step: int,
497
+ stage: str = "val",
498
+ canvas_h: int = 512,
499
+ canvas_w: int = 1024,
500
+ ):
501
+ """Entry for evaluation."""
502
+ print("Running evaluation...")
503
+ cfg = self.cfg
504
+ device = self.device
505
+ world_rank = self.world_rank
506
+
507
+ valloader = torch.utils.data.DataLoader(
508
+ self.valset, batch_size=1, shuffle=False, num_workers=1
509
+ )
510
+ ellipse_time = 0
511
+ metrics = defaultdict(list)
512
+ for i, data in enumerate(valloader):
513
+ camtoworlds = data["camtoworld"].to(device)
514
+ Ks = data["K"].to(device)
515
+ pixels = data["image"].to(device) / 255.0
516
+ height, width = pixels.shape[1:3]
517
+ masks = data["mask"].to(device) if "mask" in data else None
518
+
519
+ pixels = pixels.permute(0, 3, 1, 2) # NHWC -> NCHW
520
+ pixels = F.interpolate(pixels, size=(canvas_h, canvas_w // 2))
521
+
522
+ torch.cuda.synchronize()
523
+ tic = time.time()
524
+ colors, _, _ = self.rasterize_splats(
525
+ camtoworlds=camtoworlds,
526
+ Ks=Ks,
527
+ width=width,
528
+ height=height,
529
+ sh_degree=cfg.sh_degree,
530
+ near_plane=cfg.near_plane,
531
+ far_plane=cfg.far_plane,
532
+ masks=masks,
533
+ ) # [1, H, W, 3]
534
+ torch.cuda.synchronize()
535
+ ellipse_time += max(time.time() - tic, 1e-10)
536
+
537
+ colors = colors.permute(0, 3, 1, 2) # NHWC -> NCHW
538
+ colors = F.interpolate(colors, size=(canvas_h, canvas_w // 2))
539
+ colors = torch.clamp(colors, 0.0, 1.0)
540
+ canvas_list = [pixels, colors]
541
+
542
+ if world_rank == 0:
543
+ canvas = torch.cat(canvas_list, dim=2).squeeze(0)
544
+ canvas = canvas.permute(1, 2, 0) # CHW -> HWC
545
+ canvas = (canvas * 255).to(torch.uint8).cpu().numpy()
546
+ cv2.imwrite(
547
+ f"{self.render_dir}/{stage}_step{step}_{i:04d}.png",
548
+ canvas[..., ::-1],
549
+ )
550
+ metrics["psnr"].append(self.psnr(colors, pixels))
551
+ metrics["ssim"].append(self.ssim(colors, pixels))
552
+ metrics["lpips"].append(self.lpips(colors, pixels))
553
+
554
+ if world_rank == 0:
555
+ ellipse_time /= len(valloader)
556
+
557
+ stats = {
558
+ k: torch.stack(v).mean().item() for k, v in metrics.items()
559
+ }
560
+ stats.update(
561
+ {
562
+ "ellipse_time": ellipse_time,
563
+ "num_GS": len(self.splats["means"]),
564
+ }
565
+ )
566
+ print(
567
+ f"PSNR: {stats['psnr']:.3f}, SSIM: {stats['ssim']:.4f}, LPIPS: {stats['lpips']:.3f} "
568
+ f"Time: {stats['ellipse_time']:.3f}s/image "
569
+ f"Number of GS: {stats['num_GS']}"
570
+ )
571
+ # save stats as json
572
+ with open(
573
+ f"{self.stats_dir}/{stage}_step{step:04d}.json", "w"
574
+ ) as f:
575
+ json.dump(stats, f)
576
+ # save stats to tensorboard
577
+ for k, v in stats.items():
578
+ self.writer.add_scalar(f"{stage}/{k}", v, step)
579
+ self.writer.flush()
580
+
581
+ @torch.no_grad()
582
+ def render_video(
583
+ self, step: int, canvas_h: int = 512, canvas_w: int = 1024
584
+ ):
585
+ testloader = torch.utils.data.DataLoader(
586
+ self.testset, batch_size=1, shuffle=False, num_workers=1
587
+ )
588
+
589
+ images_cache = []
590
+ depth_global_min, depth_global_max = float("inf"), -float("inf")
591
+ for data in testloader:
592
+ camtoworlds = data["camtoworld"].to(self.device)
593
+ Ks = resize_pinhole_intrinsics(
594
+ data["K"].squeeze(),
595
+ raw_hw=(data["image_h"].item(), data["image_w"].item()),
596
+ new_hw=(canvas_h, canvas_w // 2),
597
+ ).to(self.device)
598
+ renders, _, _ = self.rasterize_splats(
599
+ camtoworlds=camtoworlds,
600
+ Ks=Ks[None, ...],
601
+ width=canvas_w // 2,
602
+ height=canvas_h,
603
+ sh_degree=self.cfg.sh_degree,
604
+ near_plane=self.cfg.near_plane,
605
+ far_plane=self.cfg.far_plane,
606
+ render_mode="RGB+ED",
607
+ ) # [1, H, W, 4]
608
+ colors = torch.clamp(renders[0, ..., 0:3], 0.0, 1.0) # [H, W, 3]
609
+ colors = (colors * 255).to(torch.uint8).cpu().numpy()
610
+ depths = renders[0, ..., 3:4] # [H, W, 1], tensor in device.
611
+ images_cache.append([colors, depths])
612
+ depth_global_min = min(depth_global_min, depths.min().item())
613
+ depth_global_max = max(depth_global_max, depths.max().item())
614
+
615
+ video_path = f"{self.render_dir}/video_step{step}.mp4"
616
+ writer = imageio.get_writer(video_path, fps=30)
617
+ for rgb, depth in images_cache:
618
+ depth_normalized = torch.clip(
619
+ (depth - depth_global_min)
620
+ / (depth_global_max - depth_global_min),
621
+ 0,
622
+ 1,
623
+ )
624
+ depth_normalized = (
625
+ (depth_normalized * 255).to(torch.uint8).cpu().numpy()
626
+ )
627
+ depth_map = cv2.applyColorMap(depth_normalized, cv2.COLORMAP_JET)
628
+ image = np.concatenate([rgb, depth_map], axis=1)
629
+ writer.append_data(image)
630
+
631
+ writer.close()
632
+
633
+
634
+ def entrypoint(
635
+ local_rank: int, world_rank, world_size: int, cfg: GsplatTrainConfig
636
+ ):
637
+ runner = Runner(local_rank, world_rank, world_size, cfg)
638
+
639
+ if cfg.ckpt is not None:
640
+ # run eval only
641
+ ckpts = [
642
+ torch.load(file, map_location=runner.device, weights_only=True)
643
+ for file in cfg.ckpt
644
+ ]
645
+ for k in runner.splats.keys():
646
+ runner.splats[k].data = torch.cat(
647
+ [ckpt["splats"][k] for ckpt in ckpts]
648
+ )
649
+ step = ckpts[0]["step"]
650
+ runner.eval(step=step)
651
+ runner.render_video(step=step)
652
+ else:
653
+ runner.train()
654
+ runner.render_video(step=cfg.max_steps - 1)
655
+
656
+
657
+ if __name__ == "__main__":
658
+ configs = {
659
+ "default": (
660
+ "Gaussian splatting training using densification heuristics from the original paper.",
661
+ GsplatTrainConfig(
662
+ strategy=DefaultStrategy(verbose=True),
663
+ ),
664
+ ),
665
+ "mcmc": (
666
+ "Gaussian splatting training using densification from the paper '3D Gaussian Splatting as Markov Chain Monte Carlo'.",
667
+ GsplatTrainConfig(
668
+ init_scale=0.1,
669
+ opacity_reg=0.01,
670
+ scale_reg=0.01,
671
+ strategy=MCMCStrategy(verbose=True),
672
+ ),
673
+ ),
674
+ }
675
+ cfg = tyro.extras.overridable_config_cli(configs)
676
+ cfg.adjust_steps(cfg.steps_scaler)
677
+
678
+ cli(entrypoint, cfg, verbose=True)
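For reference, a minimal sketch of launching this trainer programmatically rather than through tyro's CLI; the data/result paths and step count are illustrative, and the import paths assume the module layout added in this commit:

```python
from gsplat.distributed import cli
from gsplat.strategy import DefaultStrategy

from embodied_gen.trainer.gsplat_trainer import entrypoint
from embodied_gen.utils.config import GsplatTrainConfig

if __name__ == "__main__":
    cfg = GsplatTrainConfig(
        # Assumed to hold the gs_data.pt dumped by the pano-to-mesh pipeline.
        data_dir="outputs/bg",
        result_dir="outputs/bg/gsplat",
        max_steps=7_000,
        strategy=DefaultStrategy(verbose=True),
    )
    # Rescale eval/save/ply schedules by steps_scaler (a no-op at the default 1.0).
    cfg.adjust_steps(cfg.steps_scaler)
    # Single-node launch; cli() supplies (local_rank, world_rank, world_size).
    cli(entrypoint, cfg, verbose=True)
```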
embodied_gen/trainer/pono2mesh_trainer.py ADDED
@@ -0,0 +1,538 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+
18
+ from embodied_gen.utils.monkey_patches import monkey_patch_pano2room
19
+
20
+ monkey_patch_pano2room()
21
+
22
+ import os
23
+
24
+ import cv2
25
+ import numpy as np
26
+ import torch
27
+ import trimesh
28
+ from equilib import cube2equi, equi2pers
29
+ from kornia.morphology import dilation
30
+ from PIL import Image
31
+ from embodied_gen.models.sr_model import ImageRealESRGAN
32
+ from embodied_gen.utils.config import Pano2MeshSRConfig
33
+ from embodied_gen.utils.gaussian import compute_pinhole_intrinsics
34
+ from embodied_gen.utils.log import logger
35
+ from thirdparty.pano2room.modules.geo_predictors import PanoJointPredictor
36
+ from thirdparty.pano2room.modules.geo_predictors.PanoFusionDistancePredictor import (
37
+ PanoFusionDistancePredictor,
38
+ )
39
+ from thirdparty.pano2room.modules.inpainters import PanoPersFusionInpainter
40
+ from thirdparty.pano2room.modules.mesh_fusion.render import (
41
+ features_to_world_space_mesh,
42
+ render_mesh,
43
+ )
44
+ from thirdparty.pano2room.modules.mesh_fusion.sup_info import SupInfoPool
45
+ from thirdparty.pano2room.utils.camera_utils import gen_pano_rays
46
+ from thirdparty.pano2room.utils.functions import (
47
+ depth_to_distance,
48
+ get_cubemap_views_world_to_cam,
49
+ resize_image_with_aspect_ratio,
50
+ rot_z_world_to_cam,
51
+ tensor_to_pil,
52
+ )
53
+
54
+
55
+ class Pano2MeshSRPipeline:
56
+ """Converting panoramic RGB image into 3D mesh representations, followed by inpainting and mesh refinement.
57
+
58
+ This class integrates several key components including:
59
+ - Depth estimation from RGB panorama
60
+ - Inpainting of missing regions under offsets
61
+ - RGB-D to mesh conversion
62
+ - Multi-view mesh repair
63
+ - 3D Gaussian Splatting (3DGS) dataset generation
64
+
65
+ Args:
66
+ config (Pano2MeshSRConfig): Configuration object containing model and pipeline parameters.
67
+
68
+ Example:
69
+ ```python
70
+ pipeline = Pano2MeshSRPipeline(config)
71
+ pipeline(pano_image='example.png', output_dir='./output')
72
+ ```
73
+ """
74
+
75
+ def __init__(self, config: Pano2MeshSRConfig) -> None:
76
+ self.cfg = config
77
+ self.device = config.device
78
+
79
+ # Init models.
80
+ self.inpainter = PanoPersFusionInpainter(save_path=None)
81
+ self.geo_predictor = PanoJointPredictor(save_path=None)
82
+ self.pano_fusion_distance_predictor = PanoFusionDistancePredictor()
83
+ self.super_model = ImageRealESRGAN(outscale=self.cfg.upscale_factor)
84
+
85
+ # Init poses.
86
+ cubemap_w2cs = get_cubemap_views_world_to_cam()
87
+ self.cubemap_w2cs = [p.to(self.device) for p in cubemap_w2cs]
88
+ self.camera_poses = self.load_camera_poses(self.cfg.trajectory_dir)
89
+
90
+ kernel = cv2.getStructuringElement(
91
+ cv2.MORPH_ELLIPSE, self.cfg.kernel_size
92
+ )
93
+ self.kernel = torch.from_numpy(kernel).float().to(self.device)
94
+
95
+ def init_mesh_params(self) -> None:
96
+ torch.set_default_device(self.device)
97
+ self.inpaint_mask = torch.ones(
98
+ (self.cfg.cubemap_h, self.cfg.cubemap_w), dtype=torch.bool
99
+ )
100
+ self.vertices = torch.empty((3, 0), requires_grad=False)
101
+ self.colors = torch.empty((3, 0), requires_grad=False)
102
+ self.faces = torch.empty((3, 0), dtype=torch.long, requires_grad=False)
103
+
104
+ @staticmethod
105
+ def read_camera_pose_file(filepath: str) -> np.ndarray:
106
+ with open(filepath, "r") as f:
107
+ values = [float(num) for line in f for num in line.split()]
108
+
109
+ return np.array(values).reshape(4, 4)
110
+
111
+ def load_camera_poses(
112
+ self, trajectory_dir: str
113
+ ) -> tuple[np.ndarray, list[torch.Tensor]]:
114
+ pose_filenames = sorted(
115
+ [
116
+ fname
117
+ for fname in os.listdir(trajectory_dir)
118
+ if fname.startswith("camera_pose")
119
+ ]
120
+ )
121
+
122
+ pano_pose_world = None
123
+ relative_poses = []
124
+ for idx, filename in enumerate(pose_filenames):
125
+ pose_path = os.path.join(trajectory_dir, filename)
126
+ pose_matrix = self.read_camera_pose_file(pose_path)
127
+
128
+ if pano_pose_world is None:
129
+ pano_pose_world = pose_matrix.copy()
130
+ pano_pose_world[0, 3] += self.cfg.pano_center_offset[0]
131
+ pano_pose_world[2, 3] += self.cfg.pano_center_offset[1]
132
+
133
+ # Use different reference for the first 6 cubemap views
134
+ reference_pose = pose_matrix if idx < 6 else pano_pose_world
135
+ relative_matrix = pose_matrix @ np.linalg.inv(reference_pose)
136
+ relative_matrix[0:2, :] *= -1 # flip_xy
137
+ relative_matrix = (
138
+ relative_matrix @ rot_z_world_to_cam(180).cpu().numpy()
139
+ )
140
+ relative_matrix[:3, 3] *= self.cfg.pose_scale
141
+ relative_matrix = torch.tensor(
142
+ relative_matrix, dtype=torch.float32
143
+ )
144
+ relative_poses.append(relative_matrix)
145
+
146
+ return relative_poses
147
+
148
+ def load_inpaint_poses(
149
+ self, poses: torch.Tensor
150
+ ) -> dict[int, torch.Tensor]:
151
+ inpaint_poses = dict()
152
+ sampled_views = poses[:: self.cfg.inpaint_frame_stride]
153
+ init_pose = torch.eye(4)
154
+ for idx, w2c_tensor in enumerate(sampled_views):
155
+ w2c = w2c_tensor.cpu().numpy().astype(np.float32)
156
+ c2w = np.linalg.inv(w2c)
157
+ pose_tensor = init_pose.clone()
158
+ pose_tensor[:3, 3] = torch.from_numpy(c2w[:3, 3])
159
+ pose_tensor[:3, 3] *= -1
160
+ inpaint_poses[idx] = pose_tensor.to(self.device)
161
+
162
+ return inpaint_poses
163
+
164
+ def project(self, world_to_cam: torch.Tensor):
165
+ (
166
+ project_image,
167
+ project_depth,
168
+ inpaint_mask,
169
+ _,
170
+ z_buf,
171
+ mesh,
172
+ ) = render_mesh(
173
+ vertices=self.vertices,
174
+ faces=self.faces,
175
+ vertex_features=self.colors,
176
+ H=self.cfg.cubemap_h,
177
+ W=self.cfg.cubemap_w,
178
+ fov_in_degrees=self.cfg.fov,
179
+ RT=world_to_cam,
180
+ blur_radius=self.cfg.blur_radius,
181
+ faces_per_pixel=self.cfg.faces_per_pixel,
182
+ )
183
+ project_image = project_image * ~inpaint_mask
184
+
185
+ return project_image[:3, ...], inpaint_mask, project_depth
186
+
187
+ def render_pano(self, pose: torch.Tensor):
188
+ cubemap_list = []
189
+ for cubemap_pose in self.cubemap_w2cs:
190
+ project_pose = cubemap_pose @ pose
191
+ rgb, inpaint_mask, depth = self.project(project_pose)
192
+ distance_map = depth_to_distance(depth[None, ...])
193
+ mask = inpaint_mask[None, ...]
194
+ cubemap_list.append(torch.cat([rgb, distance_map, mask], dim=0))
195
+
196
+ # Set default tensor type for CPU operation in cube2equi
197
+ with torch.device("cpu"):
198
+ pano_rgbd = cube2equi(
199
+ cubemap_list, "list", self.cfg.pano_h, self.cfg.pano_w
200
+ )
201
+
202
+ pano_rgb = pano_rgbd[:3, :, :]
203
+ pano_depth = pano_rgbd[3:4, :, :].squeeze(0)
204
+ pano_mask = pano_rgbd[4:, :, :].squeeze(0)
205
+
206
+ return pano_rgb, pano_depth, pano_mask
207
+
208
+ def rgbd_to_mesh(
209
+ self,
210
+ rgb: torch.Tensor,
211
+ depth: torch.Tensor,
212
+ inpaint_mask: torch.Tensor,
213
+ world_to_cam: torch.Tensor = None,
214
+ using_distance_map: bool = True,
215
+ ) -> None:
216
+ if world_to_cam is None:
217
+ world_to_cam = torch.eye(4, dtype=torch.float32).to(self.device)
218
+
219
+ if inpaint_mask.sum() == 0:
220
+ return
221
+
222
+ vertices, faces, colors = features_to_world_space_mesh(
223
+ colors=rgb.squeeze(0),
224
+ depth=depth,
225
+ fov_in_degrees=self.cfg.fov,
226
+ world_to_cam=world_to_cam,
227
+ mask=inpaint_mask,
228
+ faces=self.faces,
229
+ vertices=self.vertices,
230
+ using_distance_map=using_distance_map,
231
+ edge_threshold=0.05,
232
+ )
233
+
234
+ faces += self.vertices.shape[1]
235
+ self.vertices = torch.cat([self.vertices, vertices], dim=1)
236
+ self.colors = torch.cat([self.colors, colors], dim=1)
237
+ self.faces = torch.cat([self.faces, faces], dim=1)
238
+
239
+ def get_edge_image_by_depth(
240
+ self, depth: torch.Tensor, dilate_iter: int = 1
241
+ ) -> np.ndarray:
242
+ if isinstance(depth, torch.Tensor):
243
+ depth = depth.cpu().detach().numpy()
244
+
245
+ gray = (depth / depth.max() * 255).astype(np.uint8)
246
+ edges = cv2.Canny(gray, 60, 150)
247
+ if dilate_iter > 0:
248
+ kernel = np.ones((3, 3), np.uint8)
249
+ edges = cv2.dilate(edges, kernel, iterations=dilate_iter)
250
+
251
+ return edges
252
+
253
+ def mesh_repair_by_greedy_view_selection(
254
+ self, pose_dict: dict[str, torch.Tensor], output_dir: str
255
+ ) -> list:
256
+ inpainted_panos_w_pose = []
257
+ while len(pose_dict) > 0:
258
+ logger.info(f"Repairing mesh left rounds {len(pose_dict)}")
259
+ sampled_views = []
260
+ for key, pose in pose_dict.items():
261
+ pano_rgb, pano_distance, pano_mask = self.render_pano(pose)
262
+ completeness = torch.sum(1 - pano_mask) / (pano_mask.numel())
263
+ sampled_views.append((key, completeness.item(), pose))
264
+
265
+ if len(sampled_views) == 0:
266
+ break
267
+
268
+ # Pick a view with relatively low completeness (two-thirds through the sorted list) to inpaint next.
269
+ sampled_views = sorted(sampled_views, key=lambda x: x[1])
270
+ key, _, pose = sampled_views[len(sampled_views) * 2 // 3]
271
+ pose_dict.pop(key)
272
+
273
+ pano_rgb, pano_distance, pano_mask = self.render_pano(pose)
274
+
275
+ colors = pano_rgb.permute(1, 2, 0).clone()
276
+ distances = pano_distance.unsqueeze(-1).clone()
277
+ pano_inpaint_mask = pano_mask.clone()
278
+ init_pose = pose.clone()
279
+ normals = None
280
+ if pano_inpaint_mask.min().item() < 0.5:
281
+ colors, distances, normals = self.inpaint_panorama(
282
+ idx=key,
283
+ colors=colors,
284
+ distances=distances,
285
+ pano_mask=pano_inpaint_mask,
286
+ )
287
+
288
+ init_pose[0, 3], init_pose[1, 3], init_pose[2, 3] = (
289
+ -pose[0, 3],
290
+ pose[2, 3],
291
+ 0,
292
+ )
293
+ rays = gen_pano_rays(
294
+ init_pose, self.cfg.pano_h, self.cfg.pano_w
295
+ )
296
+ conflict_mask = self.sup_pool.geo_check(
297
+ rays, distances.unsqueeze(-1)
298
+ ) # 0 is conflict, 1 not conflict
299
+ pano_inpaint_mask *= conflict_mask
300
+
301
+ self.rgbd_to_mesh(
302
+ colors.permute(2, 0, 1),
303
+ distances,
304
+ pano_inpaint_mask,
305
+ world_to_cam=pose,
306
+ )
307
+
308
+ self.sup_pool.register_sup_info(
309
+ pose=init_pose,
310
+ mask=pano_inpaint_mask.clone(),
311
+ rgb=colors,
312
+ distance=distances.unsqueeze(-1),
313
+ normal=normals,
314
+ )
315
+
316
+ colors = colors.permute(2, 0, 1).unsqueeze(0)
317
+ inpainted_panos_w_pose.append([colors, pose])
318
+
319
+ if self.cfg.visualize:
320
+ from embodied_gen.data.utils import DiffrastRender
321
+
322
+ tensor_to_pil(pano_rgb.unsqueeze(0)).save(
323
+ f"{output_dir}/rendered_pano_{key}.jpg"
324
+ )
325
+ tensor_to_pil(colors).save(
326
+ f"{output_dir}/inpainted_pano_{key}.jpg"
327
+ )
328
+ norm_depth = DiffrastRender.normalize_map_by_mask(
329
+ distances, torch.ones_like(distances)
330
+ )
331
+ heatmap = (norm_depth.cpu().numpy() * 255).astype(np.uint8)
332
+ heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
333
+ Image.fromarray(heatmap).save(
334
+ f"{output_dir}/inpainted_depth_{key}.png"
335
+ )
336
+
337
+ return inpainted_panos_w_pose
338
+
339
+ def inpaint_panorama(
340
+ self,
341
+ idx: int,
342
+ colors: torch.Tensor,
343
+ distances: torch.Tensor,
344
+ pano_mask: torch.Tensor,
345
+ ) -> tuple[torch.Tensor]:
346
+ mask = (pano_mask[None, ..., None] > 0.5).float()
347
+ mask = mask.permute(0, 3, 1, 2)
348
+ mask = dilation(mask, kernel=self.kernel)
349
+ mask = mask[0, 0, ..., None] # hwc
350
+ inpainted_img = self.inpainter.inpaint(idx, colors, mask)
351
+ inpainted_img = colors * (1 - mask) + inpainted_img * mask
352
+ inpainted_distances, inpainted_normals = self.geo_predictor(
353
+ idx,
354
+ inpainted_img,
355
+ distances[..., None],
356
+ mask=mask,
357
+ reg_loss_weight=0.0,
358
+ normal_loss_weight=5e-2,
359
+ normal_tv_loss_weight=5e-2,
360
+ )
361
+
362
+ return inpainted_img, inpainted_distances.squeeze(), inpainted_normals
363
+
364
+ def preprocess_pano(
365
+ self, image: Image.Image | str
366
+ ) -> tuple[torch.Tensor, torch.Tensor]:
367
+ if isinstance(image, str):
368
+ image = Image.open(image)
369
+
370
+ image = image.convert("RGB")
371
+
372
+ if image.size[0] < image.size[1]:
373
+ image = image.transpose(Image.TRANSPOSE)
374
+
375
+ image = resize_image_with_aspect_ratio(image, self.cfg.pano_w)
376
+ image_rgb = torch.tensor(np.array(image)).permute(2, 0, 1) / 255
377
+ image_rgb = image_rgb.to(self.device)
378
+ image_depth = self.pano_fusion_distance_predictor.predict(
379
+ image_rgb.permute(1, 2, 0)
380
+ )
381
+ image_depth = (
382
+ image_depth / image_depth.max() * self.cfg.depth_scale_factor
383
+ )
384
+
385
+ return image_rgb, image_depth
386
+
387
+ def pano_to_perspective(
388
+ self, pano_image: torch.Tensor, pitch: float, yaw: float, fov: float
389
+ ) -> torch.Tensor:
390
+ rots = dict(
391
+ roll=0,
392
+ pitch=pitch,
393
+ yaw=yaw,
394
+ )
395
+ perspective = equi2pers(
396
+ equi=pano_image.squeeze(0),
397
+ rots=rots,
398
+ height=self.cfg.cubemap_h,
399
+ width=self.cfg.cubemap_w,
400
+ fov_x=fov,
401
+ mode="bilinear",
402
+ ).unsqueeze(0)
403
+
404
+ return perspective
405
+
406
+ def pano_to_cubemap(self, pano_rgb: torch.Tensor):
407
+ # Define six canonical cube directions in (pitch, yaw)
408
+ directions = [
409
+ (0, 0),
410
+ (0, 1.5 * np.pi),
411
+ (0, 1.0 * np.pi),
412
+ (0, 0.5 * np.pi),
413
+ (-0.5 * np.pi, 0),
414
+ (0.5 * np.pi, 0),
415
+ ]
416
+
417
+ cubemaps_rgb = []
418
+ for pitch, yaw in directions:
419
+ rgb_view = self.pano_to_perspective(
420
+ pano_rgb, pitch, yaw, fov=self.cfg.fov
421
+ )
422
+ cubemaps_rgb.append(rgb_view.cpu())
423
+
424
+ return cubemaps_rgb
425
+
426
+ def save_mesh(self, output_path: str) -> None:
427
+ vertices_np = self.vertices.T.cpu().numpy()
428
+ colors_np = self.colors.T.cpu().numpy()
429
+ faces_np = self.faces.T.cpu().numpy()
430
+ mesh = trimesh.Trimesh(
431
+ vertices=vertices_np, faces=faces_np, vertex_colors=colors_np
432
+ )
433
+
434
+ mesh.export(output_path)
435
+
436
+ def mesh_pose_to_gs_pose(self, mesh_pose: torch.Tensor) -> np.ndarray:
437
+ pose = mesh_pose.clone()
438
+ pose[0, :] *= -1
439
+ pose[1, :] *= -1
440
+
441
+ Rw2c = pose[:3, :3].cpu().numpy()
442
+ Tw2c = pose[:3, 3:].cpu().numpy()
443
+ yz_reverse = np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]])
444
+
445
+ Rc2w = (yz_reverse @ Rw2c).T
446
+ Tc2w = -(Rc2w @ yz_reverse @ Tw2c)
447
+ c2w = np.concatenate((Rc2w, Tc2w), axis=1)
448
+ c2w = np.concatenate((c2w, np.array([[0, 0, 0, 1]])), axis=0)
449
+
450
+ return c2w
451
+
452
+ def __call__(self, pano_image: Image.Image | str, output_dir: str):
453
+ self.init_mesh_params()
454
+ pano_rgb, pano_depth = self.preprocess_pano(pano_image)
455
+ self.sup_pool = SupInfoPool()
456
+ self.sup_pool.register_sup_info(
457
+ pose=torch.eye(4).to(self.device),
458
+ mask=torch.ones([self.cfg.pano_h, self.cfg.pano_w]),
459
+ rgb=pano_rgb.permute(1, 2, 0),
460
+ distance=pano_depth[..., None],
461
+ )
462
+ self.sup_pool.gen_occ_grid(res=256)
463
+
464
+ logger.info("Init mesh from pano RGBD image...")
465
+ depth_edge = self.get_edge_image_by_depth(pano_depth)
466
+ inpaint_edge_mask = (
467
+ ~torch.from_numpy(depth_edge).to(self.device).bool()
468
+ )
469
+ self.rgbd_to_mesh(pano_rgb, pano_depth, inpaint_edge_mask)
470
+
471
+ repair_poses = self.load_inpaint_poses(self.camera_poses)
472
+ inpainted_panos_w_poses = self.mesh_repair_by_greedy_view_selection(
473
+ repair_poses, output_dir
474
+ )
475
+ torch.cuda.empty_cache()
476
+ torch.set_default_device("cpu")
477
+
478
+ if self.cfg.mesh_file is not None:
479
+ mesh_path = os.path.join(output_dir, self.cfg.mesh_file)
480
+ self.save_mesh(mesh_path)
481
+
482
+ if self.cfg.gs_data_file is None:
483
+ return
484
+
485
+ logger.info(f"Dump data for 3DGS training...")
486
+ points_rgb = (self.colors.clip(0, 1) * 255).to(torch.uint8)
487
+ data = {
488
+ "points": self.vertices.permute(1, 0).cpu().numpy(), # (N, 3)
489
+ "points_rgb": points_rgb.permute(1, 0).cpu().numpy(), # (N, 3)
490
+ "train": [],
491
+ "eval": [],
492
+ }
493
+ image_h = self.cfg.cubemap_h * self.cfg.upscale_factor
494
+ image_w = self.cfg.cubemap_w * self.cfg.upscale_factor
495
+ Ks = compute_pinhole_intrinsics(image_w, image_h, self.cfg.fov)
496
+ for idx, (pano_img, pano_pose) in enumerate(inpainted_panos_w_poses):
497
+ cubemaps = self.pano_to_cubemap(pano_img)
498
+ for i in range(len(cubemaps)):
499
+ cubemap = tensor_to_pil(cubemaps[i])
500
+ cubemap = self.super_model(cubemap)
501
+ mesh_pose = self.cubemap_w2cs[i] @ pano_pose
502
+ c2w = self.mesh_pose_to_gs_pose(mesh_pose)
503
+ data["train"].append(
504
+ {
505
+ "camtoworld": c2w.astype(np.float32),
506
+ "K": Ks.astype(np.float32),
507
+ "image": np.array(cubemap),
508
+ "image_h": image_h,
509
+ "image_w": image_w,
510
+ "image_id": len(cubemaps) * idx + i,
511
+ }
512
+ )
513
+
514
+ # Camera poses for evaluation.
515
+ for idx in range(len(self.camera_poses)):
516
+ c2w = self.mesh_pose_to_gs_pose(self.camera_poses[idx])
517
+ data["eval"].append(
518
+ {
519
+ "camtoworld": c2w.astype(np.float32),
520
+ "K": Ks.astype(np.float32),
521
+ "image_h": image_h,
522
+ "image_w": image_w,
523
+ "image_id": idx,
524
+ }
525
+ )
526
+
527
+ data_path = os.path.join(output_dir, self.cfg.gs_data_file)
528
+ torch.save(data, data_path)
529
+
530
+ return
531
+
532
+
533
+ if __name__ == "__main__":
534
+ output_dir = "outputs/bg_v2/test3"
535
+ input_pano = "apps/assets/example_scene/result_pano.png"
536
+ config = Pano2MeshSRConfig()
537
+ pipeline = Pano2MeshSRPipeline(config)
538
+ pipeline(input_pano, output_dir)
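The gs_data.pt payload written above is a plain torch pickle of numpy arrays; a minimal sketch of inspecting it (the path is illustrative and matches the defaults in Pano2MeshSRConfig; the field names are taken from the dict assembled in __call__):

```python
import torch

data = torch.load("outputs/bg_v2/test3/gs_data.pt", weights_only=False)

print(data["points"].shape, data["points_rgb"].shape)  # both (N, 3)
for sample in data["train"][:2]:
    # Per-view record: camera-to-world pose, pinhole intrinsics, upscaled RGB frame.
    print(sample["image_id"], sample["camtoworld"].shape,
          sample["K"].shape, sample["image"].shape)
for sample in data["eval"][:2]:
    # Eval records carry poses and intrinsics only (no rendered image).
    print(sample["image_id"], sample["camtoworld"].shape)
```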
embodied_gen/utils/config.py ADDED
@@ -0,0 +1,190 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+ from dataclasses import dataclass, field
18
+ from typing import List, Optional, Union
19
+
20
+ from gsplat.strategy import DefaultStrategy, MCMCStrategy
21
+ from typing_extensions import Literal, assert_never
22
+
23
+ __all__ = [
24
+ "Pano2MeshSRConfig",
25
+ "GsplatTrainConfig",
26
+ ]
27
+
28
+
29
+ @dataclass
30
+ class Pano2MeshSRConfig:
31
+ mesh_file: str = "mesh_model.ply"
32
+ gs_data_file: str = "gs_data.pt"
33
+ device: str = "cuda"
34
+ blur_radius: int = 0
35
+ faces_per_pixel: int = 8
36
+ fov: int = 90
37
+ pano_w: int = 2048
38
+ pano_h: int = 1024
39
+ cubemap_w: int = 512
40
+ cubemap_h: int = 512
41
+ pose_scale: float = 0.6
42
+ pano_center_offset: tuple = (-0.2, 0.3)
43
+ inpaint_frame_stride: int = 20
44
+ trajectory_dir: str = "apps/assets/example_scene/camera_trajectory"
45
+ visualize: bool = False
46
+ depth_scale_factor: float = 3.4092
47
+ kernel_size: tuple = (9, 9)
48
+ upscale_factor: int = 4
49
+
50
+
51
+ @dataclass
52
+ class GsplatTrainConfig:
53
+ # Path to the .pt files. If provided, training is skipped and only evaluation runs.
54
+ ckpt: Optional[List[str]] = None
55
+ # Render trajectory path
56
+ render_traj_path: str = "interp"
57
+
58
+ # Path to the Mip-NeRF 360 dataset
59
+ data_dir: str = "outputs/bg"
60
+ # Downsample factor for the dataset
61
+ data_factor: int = 4
62
+ # Directory to save results
63
+ result_dir: str = "outputs/bg"
64
+ # Every N images there is a test image
65
+ test_every: int = 8
66
+ # Random crop size for training (experimental)
67
+ patch_size: Optional[int] = None
68
+ # A global scaler that applies to the scene size related parameters
69
+ global_scale: float = 1.0
70
+ # Normalize the world space
71
+ normalize_world_space: bool = True
72
+ # Camera model
73
+ camera_model: Literal["pinhole", "ortho", "fisheye"] = "pinhole"
74
+
75
+ # Port for the viewer server
76
+ port: int = 8080
77
+
78
+ # Batch size for training. Learning rates are scaled automatically
79
+ batch_size: int = 1
80
+ # A global factor to scale the number of training steps
81
+ steps_scaler: float = 1.0
82
+
83
+ # Number of training steps
84
+ max_steps: int = 30_000
85
+ # Steps to evaluate the model
86
+ eval_steps: List[int] = field(default_factory=lambda: [7_000, 30_000])
87
+ # Steps to save the model
88
+ save_steps: List[int] = field(default_factory=lambda: [7_000, 30_000])
89
+ # Whether to save ply file (storage size can be large)
90
+ save_ply: bool = True
91
+ # Steps to save the model as ply
92
+ ply_steps: List[int] = field(default_factory=lambda: [7_000, 30_000])
93
+ # Whether to disable video generation during training and evaluation
94
+ disable_video: bool = False
95
+
96
+ # Initial number of GSs. Ignored if using sfm
97
+ init_num_pts: int = 100_000
98
+ # Initial extent of GSs as a multiple of the camera extent. Ignored if using sfm
99
+ init_extent: float = 3.0
100
+ # Degree of spherical harmonics
101
+ sh_degree: int = 1
102
+ # Increase the active SH degree by one every this many steps
103
+ sh_degree_interval: int = 1000
104
+ # Initial opacity of GS
105
+ init_opa: float = 0.1
106
+ # Initial scale of GS
107
+ init_scale: float = 1.0
108
+ # Weight for SSIM loss
109
+ ssim_lambda: float = 0.2
110
+
111
+ # Near plane clipping distance
112
+ near_plane: float = 0.01
113
+ # Far plane clipping distance
114
+ far_plane: float = 1e10
115
+
116
+ # Strategy for GS densification
117
+ strategy: Union[DefaultStrategy, MCMCStrategy] = field(
118
+ default_factory=DefaultStrategy
119
+ )
120
+ # Use packed mode for rasterization; this uses less memory but is slightly slower.
121
+ packed: bool = False
122
+ # Use sparse gradients for optimization. (experimental)
123
+ sparse_grad: bool = False
124
+ # Use visible adam from Taming 3DGS. (experimental)
125
+ visible_adam: bool = False
126
+ # Anti-aliasing in rasterization. Might slightly hurt quantitative metrics.
127
+ antialiased: bool = False
128
+
129
+ # Use random background for training to discourage transparency
130
+ random_bkgd: bool = False
131
+
132
+ # LR for 3D point positions
133
+ means_lr: float = 1.6e-4
134
+ # LR for Gaussian scale factors
135
+ scales_lr: float = 5e-3
136
+ # LR for alpha blending weights
137
+ opacities_lr: float = 5e-2
138
+ # LR for orientation (quaternions)
139
+ quats_lr: float = 1e-3
140
+ # LR for SH band 0 (brightness)
141
+ sh0_lr: float = 2.5e-3
142
+ # LR for higher-order SH (detail)
143
+ shN_lr: float = 2.5e-3 / 20
144
+
145
+ # Opacity regularization
146
+ opacity_reg: float = 0.0
147
+ # Scale regularization
148
+ scale_reg: float = 0.0
149
+
150
+ # Enable depth loss. (experimental)
151
+ depth_loss: bool = False
152
+ # Weight for depth loss
153
+ depth_lambda: float = 1e-2
154
+
155
+ # Dump information to tensorboard every this steps
156
+ tb_every: int = 200
157
+ # Save training images to tensorboard
158
+ tb_save_image: bool = False
159
+
160
+ lpips_net: Literal["vgg", "alex"] = "alex"
161
+
162
+ # 3DGUT (unscented transform + eval 3D)
163
+ with_ut: bool = False
164
+ with_eval3d: bool = False
165
+
166
+ scene_scale: float = 1.0
167
+
168
+ def adjust_steps(self, factor: float):
169
+ self.eval_steps = [int(i * factor) for i in self.eval_steps]
170
+ self.save_steps = [int(i * factor) for i in self.save_steps]
171
+ self.ply_steps = [int(i * factor) for i in self.ply_steps]
172
+ self.max_steps = int(self.max_steps * factor)
173
+ self.sh_degree_interval = int(self.sh_degree_interval * factor)
174
+
175
+ strategy = self.strategy
176
+ if isinstance(strategy, DefaultStrategy):
177
+ strategy.refine_start_iter = int(
178
+ strategy.refine_start_iter * factor
179
+ )
180
+ strategy.refine_stop_iter = int(strategy.refine_stop_iter * factor)
181
+ strategy.reset_every = int(strategy.reset_every * factor)
182
+ strategy.refine_every = int(strategy.refine_every * factor)
183
+ elif isinstance(strategy, MCMCStrategy):
184
+ strategy.refine_start_iter = int(
185
+ strategy.refine_start_iter * factor
186
+ )
187
+ strategy.refine_stop_iter = int(strategy.refine_stop_iter * factor)
188
+ strategy.refine_every = int(strategy.refine_every * factor)
189
+ else:
190
+ assert_never(strategy)
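As a quick illustration of the schedule scaling above (a sketch using the default values scaled by 0.5):

```python
from gsplat.strategy import DefaultStrategy

from embodied_gen.utils.config import GsplatTrainConfig

cfg = GsplatTrainConfig(steps_scaler=0.5, strategy=DefaultStrategy())
cfg.adjust_steps(cfg.steps_scaler)

# max_steps: 30_000 -> 15_000; eval/save/ply steps: [7_000, 30_000] -> [3_500, 15_000];
# the strategy's refine/reset intervals are rescaled by the same factor.
print(cfg.max_steps, cfg.eval_steps, cfg.strategy.refine_every)
```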
embodied_gen/utils/enum.py ADDED
@@ -0,0 +1,107 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+ from dataclasses import dataclass, field
18
+ from enum import Enum
19
+
20
+ from dataclasses_json import DataClassJsonMixin
21
+
22
+ __all__ = [
23
+ "RenderItems",
24
+ "Scene3DItemEnum",
25
+ "SpatialRelationEnum",
26
+ "RobotItemEnum",
27
+ ]
28
+
29
+
30
+ @dataclass
31
+ class RenderItems(str, Enum):
32
+ IMAGE = "image_color"
33
+ ALPHA = "image_mask"
34
+ VIEW_NORMAL = "image_view_normal"
35
+ GLOBAL_NORMAL = "image_global_normal"
36
+ POSITION_MAP = "image_position"
37
+ DEPTH = "image_depth"
38
+ ALBEDO = "image_albedo"
39
+ DIFFUSE = "image_diffuse"
40
+
41
+
42
+ @dataclass
43
+ class Scene3DItemEnum(str, Enum):
44
+ BACKGROUND = "background"
45
+ CONTEXT = "context"
46
+ ROBOT = "robot"
47
+ MANIPULATED_OBJS = "manipulated_objs"
48
+ DISTRACTOR_OBJS = "distractor_objs"
49
+ OTHERS = "others"
50
+
51
+ @classmethod
52
+ def object_list(cls, layout_relation: dict) -> list:
53
+ return (
54
+ [
55
+ layout_relation[cls.BACKGROUND.value],
56
+ layout_relation[cls.CONTEXT.value],
57
+ ]
58
+ + layout_relation[cls.MANIPULATED_OBJS.value]
59
+ + layout_relation[cls.DISTRACTOR_OBJS.value]
60
+ )
61
+
62
+ @classmethod
63
+ def object_mapping(cls, layout_relation):
64
+ relation_mapping = {
65
+ # layout_relation[cls.ROBOT.value]: cls.ROBOT.value,
66
+ layout_relation[cls.BACKGROUND.value]: cls.BACKGROUND.value,
67
+ layout_relation[cls.CONTEXT.value]: cls.CONTEXT.value,
68
+ }
69
+ relation_mapping.update(
70
+ {
71
+ item: cls.MANIPULATED_OBJS.value
72
+ for item in layout_relation[cls.MANIPULATED_OBJS.value]
73
+ }
74
+ )
75
+ relation_mapping.update(
76
+ {
77
+ item: cls.DISTRACTOR_OBJS.value
78
+ for item in layout_relation[cls.DISTRACTOR_OBJS.value]
79
+ }
80
+ )
81
+
82
+ return relation_mapping
83
+
84
+
85
+ @dataclass
86
+ class SpatialRelationEnum(str, Enum):
87
+ ON = "ON" # objects on the table
88
+ IN = "IN" # objects in the room
89
+ INSIDE = "INSIDE" # objects inside the shelf/rack
90
+ FLOOR = "FLOOR" # object floor room/bin
91
+
92
+
93
+ @dataclass
94
+ class RobotItemEnum(str, Enum):
95
+ FRANKA = "franka"
96
+ UR5 = "ur5"
97
+ PIPER = "piper"
98
+
99
+
100
+ @dataclass
101
+ class LayoutInfo(DataClassJsonMixin):
102
+ tree: dict[str, list]
103
+ relation: dict[str, str | list[str]]
104
+ objs_desc: dict[str, str] = field(default_factory=dict)
105
+ assets: dict[str, str] = field(default_factory=dict)
106
+ quality: dict[str, str] = field(default_factory=dict)
107
+ position: dict[str, list[float]] = field(default_factory=dict)
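A small usage sketch of the enums above; the layout_relation dict here is hypothetical and only mirrors the keys that object_list and object_mapping expect:

```python
from embodied_gen.utils.enum import Scene3DItemEnum

# Hypothetical layout description keyed by Scene3DItemEnum values.
layout_relation = {
    Scene3DItemEnum.BACKGROUND.value: "kitchen_room",
    Scene3DItemEnum.CONTEXT.value: "dining_table",
    Scene3DItemEnum.MANIPULATED_OBJS.value: ["mug", "plate"],
    Scene3DItemEnum.DISTRACTOR_OBJS.value: ["vase"],
}

objects = Scene3DItemEnum.object_list(layout_relation)
# -> ["kitchen_room", "dining_table", "mug", "plate", "vase"]
roles = Scene3DItemEnum.object_mapping(layout_relation)
# -> {"kitchen_room": "background", "dining_table": "context",
#     "mug": "manipulated_objs", "plate": "manipulated_objs",
#     "vase": "distractor_objs"}
assert roles["mug"] == Scene3DItemEnum.MANIPULATED_OBJS.value
```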
embodied_gen/utils/gaussian.py ADDED
@@ -0,0 +1,331 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+ # Part of the code comes from https://github.com/nerfstudio-project/gsplat
17
+ # Both under the Apache License, Version 2.0.
18
+
19
+
20
+ import math
21
+ import random
22
+ from io import BytesIO
23
+ from typing import Dict, Literal, Optional, Tuple
24
+
25
+ import numpy as np
26
+ import torch
27
+ import trimesh
28
+ from gsplat.optimizers import SelectiveAdam
29
+ from scipy.spatial.transform import Rotation
30
+ from sklearn.neighbors import NearestNeighbors
31
+ from torch import Tensor
32
+ from embodied_gen.models.gs_model import GaussianOperator
33
+
34
+ __all__ = [
35
+ "set_random_seed",
36
+ "export_splats",
37
+ "create_splats_with_optimizers",
38
+ "compute_pinhole_intrinsics",
39
+ "resize_pinhole_intrinsics",
40
+ "restore_scene_scale_and_position",
41
+ ]
42
+
43
+
44
+ def knn(x: Tensor, K: int = 4) -> Tensor:
45
+ x_np = x.cpu().numpy()
46
+ model = NearestNeighbors(n_neighbors=K, metric="euclidean").fit(x_np)
47
+ distances, _ = model.kneighbors(x_np)
48
+ return torch.from_numpy(distances).to(x)
49
+
50
+
51
+ def rgb_to_sh(rgb: Tensor) -> Tensor:
52
+ C0 = 0.28209479177387814
53
+ return (rgb - 0.5) / C0
54
+
55
+
56
+ def set_random_seed(seed: int):
57
+ random.seed(seed)
58
+ np.random.seed(seed)
59
+ torch.manual_seed(seed)
60
+
61
+
62
+ def splat2ply_bytes(
63
+ means: torch.Tensor,
64
+ scales: torch.Tensor,
65
+ quats: torch.Tensor,
66
+ opacities: torch.Tensor,
67
+ sh0: torch.Tensor,
68
+ shN: torch.Tensor,
69
+ ) -> bytes:
70
+ num_splats = means.shape[0]
71
+ buffer = BytesIO()
72
+
73
+ # Write PLY header
74
+ buffer.write(b"ply\n")
75
+ buffer.write(b"format binary_little_endian 1.0\n")
76
+ buffer.write(f"element vertex {num_splats}\n".encode())
77
+ buffer.write(b"property float x\n")
78
+ buffer.write(b"property float y\n")
79
+ buffer.write(b"property float z\n")
80
+ for i, data in enumerate([sh0, shN]):
81
+ prefix = "f_dc" if i == 0 else "f_rest"
82
+ for j in range(data.shape[1]):
83
+ buffer.write(f"property float {prefix}_{j}\n".encode())
84
+ buffer.write(b"property float opacity\n")
85
+ for i in range(scales.shape[1]):
86
+ buffer.write(f"property float scale_{i}\n".encode())
87
+ for i in range(quats.shape[1]):
88
+ buffer.write(f"property float rot_{i}\n".encode())
89
+ buffer.write(b"end_header\n")
90
+
91
+ # Concatenate all tensors in the correct order
92
+ splat_data = torch.cat(
93
+ [means, sh0, shN, opacities.unsqueeze(1), scales, quats], dim=1
94
+ )
95
+ # Ensure correct dtype
96
+ splat_data = splat_data.to(torch.float32)
97
+
98
+ # Write binary data
99
+ float_dtype = np.dtype(np.float32).newbyteorder("<")
100
+ buffer.write(
101
+ splat_data.detach().cpu().numpy().astype(float_dtype).tobytes()
102
+ )
103
+
104
+ return buffer.getvalue()
105
+
106
+
107
+ def export_splats(
108
+ means: torch.Tensor,
109
+ scales: torch.Tensor,
110
+ quats: torch.Tensor,
111
+ opacities: torch.Tensor,
112
+ sh0: torch.Tensor,
113
+ shN: torch.Tensor,
114
+ format: Literal["ply"] = "ply",
115
+ save_to: Optional[str] = None,
116
+ ) -> bytes:
117
+ """Export a Gaussian Splats model to bytes in PLY file format."""
118
+ total_splats = means.shape[0]
119
+ assert means.shape == (total_splats, 3), "Means must be of shape (N, 3)"
120
+ assert scales.shape == (total_splats, 3), "Scales must be of shape (N, 3)"
121
+ assert quats.shape == (
122
+ total_splats,
123
+ 4,
124
+ ), "Quaternions must be of shape (N, 4)"
125
+ assert opacities.shape == (
126
+ total_splats,
127
+ ), "Opacities must be of shape (N,)"
128
+ assert sh0.shape == (total_splats, 1, 3), "sh0 must be of shape (N, 1, 3)"
129
+ assert (
130
+ shN.ndim == 3 and shN.shape[0] == total_splats and shN.shape[2] == 3
131
+ ), f"shN must be of shape (N, K, 3), got {shN.shape}"
132
+
133
+ # Reshape spherical harmonics
134
+ sh0 = sh0.squeeze(1) # Shape (N, 3)
135
+ shN = shN.permute(0, 2, 1).reshape(means.shape[0], -1) # Shape (N, K * 3)
136
+
137
+ # Check for NaN or Inf values
138
+ invalid_mask = (
139
+ torch.isnan(means).any(dim=1)
140
+ | torch.isinf(means).any(dim=1)
141
+ | torch.isnan(scales).any(dim=1)
142
+ | torch.isinf(scales).any(dim=1)
143
+ | torch.isnan(quats).any(dim=1)
144
+ | torch.isinf(quats).any(dim=1)
145
+ | torch.isnan(opacities).any(dim=0)
146
+ | torch.isinf(opacities).any(dim=0)
147
+ | torch.isnan(sh0).any(dim=1)
148
+ | torch.isinf(sh0).any(dim=1)
149
+ | torch.isnan(shN).any(dim=1)
150
+ | torch.isinf(shN).any(dim=1)
151
+ )
152
+
153
+ # Filter out invalid entries
154
+ valid_mask = ~invalid_mask
155
+ means = means[valid_mask]
156
+ scales = scales[valid_mask]
157
+ quats = quats[valid_mask]
158
+ opacities = opacities[valid_mask]
159
+ sh0 = sh0[valid_mask]
160
+ shN = shN[valid_mask]
161
+
162
+ if format == "ply":
163
+ data = splat2ply_bytes(means, scales, quats, opacities, sh0, shN)
164
+ else:
165
+ raise ValueError(f"Unsupported format: {format}")
166
+
167
+ if save_to:
168
+ with open(save_to, "wb") as binary_file:
169
+ binary_file.write(data)
170
+
171
+ return data
172
+
173
+
174
+ def create_splats_with_optimizers(
175
+ points: np.ndarray = None,
176
+ points_rgb: np.ndarray = None,
177
+ init_num_pts: int = 100_000,
178
+ init_extent: float = 3.0,
179
+ init_opacity: float = 0.1,
180
+ init_scale: float = 1.0,
181
+ means_lr: float = 1.6e-4,
182
+ scales_lr: float = 5e-3,
183
+ opacities_lr: float = 5e-2,
184
+ quats_lr: float = 1e-3,
185
+ sh0_lr: float = 2.5e-3,
186
+ shN_lr: float = 2.5e-3 / 20,
187
+ scene_scale: float = 1.0,
188
+ sh_degree: int = 3,
189
+ sparse_grad: bool = False,
190
+ visible_adam: bool = False,
191
+ batch_size: int = 1,
192
+ feature_dim: Optional[int] = None,
193
+ device: str = "cuda",
194
+ world_rank: int = 0,
195
+ world_size: int = 1,
196
+ ) -> Tuple[torch.nn.ParameterDict, Dict[str, torch.optim.Optimizer]]:
197
+ if points is not None and points_rgb is not None:
198
+ points = torch.from_numpy(points).float()
199
+ rgbs = torch.from_numpy(points_rgb / 255.0).float()
200
+ else:
201
+ points = (
202
+ init_extent * scene_scale * (torch.rand((init_num_pts, 3)) * 2 - 1)
203
+ )
204
+ rgbs = torch.rand((init_num_pts, 3))
205
+
206
+ # Initialize the GS size to be the average dist of the 3 nearest neighbors
207
+ dist2_avg = (knn(points, 4)[:, 1:] ** 2).mean(dim=-1) # [N,]
208
+ dist_avg = torch.sqrt(dist2_avg)
209
+ scales = (
210
+ torch.log(dist_avg * init_scale).unsqueeze(-1).repeat(1, 3)
211
+ ) # [N, 3]
212
+
213
+ # Distribute the GSs to different ranks (also works for single rank)
214
+ points = points[world_rank::world_size]
215
+ rgbs = rgbs[world_rank::world_size]
216
+ scales = scales[world_rank::world_size]
217
+
218
+ N = points.shape[0]
219
+ quats = torch.rand((N, 4)) # [N, 4]
220
+ opacities = torch.logit(torch.full((N,), init_opacity)) # [N,]
221
+
222
+ params = [
223
+ # name, value, lr
224
+ ("means", torch.nn.Parameter(points), means_lr * scene_scale),
225
+ ("scales", torch.nn.Parameter(scales), scales_lr),
226
+ ("quats", torch.nn.Parameter(quats), quats_lr),
227
+ ("opacities", torch.nn.Parameter(opacities), opacities_lr),
228
+ ]
229
+
230
+ if feature_dim is None:
231
+ # color is SH coefficients.
232
+ colors = torch.zeros((N, (sh_degree + 1) ** 2, 3)) # [N, K, 3]
233
+ colors[:, 0, :] = rgb_to_sh(rgbs)
234
+ params.append(("sh0", torch.nn.Parameter(colors[:, :1, :]), sh0_lr))
235
+ params.append(("shN", torch.nn.Parameter(colors[:, 1:, :]), shN_lr))
236
+ else:
237
+ # features will be used for appearance and view-dependent shading
238
+ features = torch.rand(N, feature_dim) # [N, feature_dim]
239
+ params.append(("features", torch.nn.Parameter(features), sh0_lr))
240
+ colors = torch.logit(rgbs) # [N, 3]
241
+ params.append(("colors", torch.nn.Parameter(colors), sh0_lr))
242
+
243
+ splats = torch.nn.ParameterDict({n: v for n, v, _ in params}).to(device)
244
+ # Scale learning rate based on batch size, reference:
245
+ # https://www.cs.princeton.edu/~smalladi/blog/2024/01/22/SDEs-ScalingRules/
246
+ # Note that this would not make the training exactly equivalent, see
247
+ # https://arxiv.org/pdf/2402.18824v1
248
+ BS = batch_size * world_size
249
+ optimizer_class = None
250
+ if sparse_grad:
251
+ optimizer_class = torch.optim.SparseAdam
252
+ elif visible_adam:
253
+ optimizer_class = SelectiveAdam
254
+ else:
255
+ optimizer_class = torch.optim.Adam
256
+ optimizers = {
257
+ name: optimizer_class(
258
+ [{"params": splats[name], "lr": lr * math.sqrt(BS), "name": name}],
259
+ eps=1e-15 / math.sqrt(BS),
260
+ # TODO: check betas logic when BS is larger than 10 betas[0] will be zero.
261
+ betas=(1 - BS * (1 - 0.9), 1 - BS * (1 - 0.999)),
262
+ )
263
+ for name, _, lr in params
264
+ }
265
+ return splats, optimizers
266
+
267
+
268
+ def compute_pinhole_intrinsics(
269
+ image_w: int, image_h: int, fov_deg: float
270
+ ) -> np.ndarray:
271
+ fov_rad = np.deg2rad(fov_deg)
272
+ fx = image_w / (2 * np.tan(fov_rad / 2))
273
+ fy = fx # assuming square pixels
274
+ cx = image_w / 2
275
+ cy = image_h / 2
276
+ K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
277
+
278
+ return K
279
+
280
+
281
+ def resize_pinhole_intrinsics(
282
+ raw_K: np.ndarray | torch.Tensor,
283
+ raw_hw: tuple[int, int],
284
+ new_hw: tuple[int, int],
285
+ ) -> np.ndarray:
286
+ raw_h, raw_w = raw_hw
287
+ new_h, new_w = new_hw
288
+
289
+ scale_x = new_w / raw_w
290
+ scale_y = new_h / raw_h
291
+
292
+ new_K = raw_K.copy() if isinstance(raw_K, np.ndarray) else raw_K.clone()
293
+ new_K[0, 0] *= scale_x # fx
294
+ new_K[0, 2] *= scale_x # cx
295
+ new_K[1, 1] *= scale_y # fy
296
+ new_K[1, 2] *= scale_y # cy
297
+
298
+ return new_K
299
+
300
+
301
+ def restore_scene_scale_and_position(
302
+ real_height: float, mesh_path: str, gs_path: str
303
+ ) -> None:
304
+ """Scales a mesh and corresponding GS model to match a given real-world height.
305
+
306
+ Uses the 1st and 99th percentile of mesh Z-axis to estimate height,
307
+ applies scaling and vertical alignment, and updates both the mesh and GS model.
308
+
309
+ Args:
310
+ real_height (float): Target real-world height along the Z axis.
311
+ mesh_path (str): Path to the input mesh file.
312
+ gs_path (str): Path to the Gaussian Splatting model file.
313
+ """
314
+ mesh = trimesh.load(mesh_path)
315
+ z_min = np.percentile(mesh.vertices[:, 1], 1)
316
+ z_max = np.percentile(mesh.vertices[:, 1], 99)
317
+ height = z_max - z_min
318
+ scale = real_height / height
319
+
320
+ rot = Rotation.from_quat([0, 1, 0, 0])
321
+ mesh.vertices = rot.apply(mesh.vertices)
322
+ mesh.vertices[:, 1] -= z_min
323
+ mesh.vertices *= scale
324
+ mesh.export(mesh_path)
325
+
326
+ gs_model: GaussianOperator = GaussianOperator.load_from_ply(gs_path)
327
+ gs_model = gs_model.get_gaussians(
328
+ instance_pose=torch.tensor([0.0, -z_min, 0, 0, 1, 0, 0])
329
+ )
330
+ gs_model.rescale(scale)
331
+ gs_model.save_to_ply(gs_path)
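A rough sketch of how the new splat helpers fit together (CPU-only toy sizes; assumes the EmbodiedGen environment with gsplat installed):

from embodied_gen.utils.gaussian import (
    compute_pinhole_intrinsics,
    create_splats_with_optimizers,
    export_splats,
    resize_pinhole_intrinsics,
)

K = compute_pinhole_intrinsics(image_w=640, image_h=480, fov_deg=60.0)
K_half = resize_pinhole_intrinsics(K, raw_hw=(480, 640), new_hw=(240, 320))

splats, optimizers = create_splats_with_optimizers(init_num_pts=1_000, device="cpu")
export_splats(
    means=splats["means"],
    scales=splats["scales"],
    quats=splats["quats"],
    opacities=splats["opacities"],
    sh0=splats["sh0"],
    shN=splats["shN"],
    save_to="toy_splats.ply",
)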
embodied_gen/utils/gpt_clients.py CHANGED
@@ -30,12 +30,20 @@ from tenacity import (
30
  stop_after_delay,
31
  wait_random_exponential,
32
  )
33
- from embodied_gen.utils.process_media import combine_images_to_base64
34
 
35
- logging.basicConfig(level=logging.INFO)
 
36
  logger = logging.getLogger(__name__)
37
 
38
 
 
 
 
 
 
 
 
39
  class GPTclient:
40
  """A client to interact with the GPT model via OpenAI or Azure API."""
41
 
@@ -45,6 +53,7 @@ class GPTclient:
45
  api_key: str,
46
  model_name: str = "yfb-gpt-4o",
47
  api_version: str = None,
 
48
  verbose: bool = False,
49
  ):
50
  if api_version is not None:
@@ -63,6 +72,9 @@ class GPTclient:
63
  self.model_name = model_name
64
  self.image_formats = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif"}
65
  self.verbose = verbose
 
 
 
66
  logger.info(f"Using GPT model: {self.model_name}.")
67
 
68
  @retry(
@@ -77,6 +89,7 @@ class GPTclient:
77
  text_prompt: str,
78
  image_base64: Optional[list[str | Image.Image]] = None,
79
  system_role: Optional[str] = None,
 
80
  ) -> Optional[str]:
81
  """Queries the GPT model with a text and optional image prompts.
82
 
@@ -86,6 +99,7 @@ class GPTclient:
86
  or local image paths or PIL.Image to accompany the text prompt.
87
  system_role (Optional[str]): Optional system-level instructions
88
  that specify the behavior of the assistant.
 
89
 
90
  Returns:
91
  Optional[str]: The response content generated by the model based on
@@ -103,11 +117,11 @@ class GPTclient:
103
 
104
  # Process images if provided
105
  if image_base64 is not None:
106
- image_base64 = (
107
- image_base64
108
- if isinstance(image_base64, list)
109
- else [image_base64]
110
- )
111
  for img in image_base64:
112
  if isinstance(img, Image.Image):
113
  buffer = BytesIO()
@@ -142,8 +156,11 @@ class GPTclient:
142
  "frequency_penalty": 0,
143
  "presence_penalty": 0,
144
  "stop": None,
 
145
  }
146
- payload.update({"model": self.model_name})
 
 
147
 
148
  response = None
149
  try:
@@ -159,8 +176,28 @@ class GPTclient:
159
 
160
  return response
161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
- with open("embodied_gen/utils/gpt_config.yaml", "r") as f:
 
164
  config = yaml.safe_load(f)
165
 
166
  agent_type = config["agent_type"]
@@ -177,32 +214,5 @@ GPT_CLIENT = GPTclient(
177
  api_key=api_key,
178
  api_version=api_version,
179
  model_name=model_name,
 
180
  )
181
-
182
- if __name__ == "__main__":
183
- if "openrouter" in GPT_CLIENT.endpoint:
184
- response = GPT_CLIENT.query(
185
- text_prompt="What is the content in each image?",
186
- image_base64=combine_images_to_base64(
187
- [
188
- "apps/assets/example_image/sample_02.jpg",
189
- "apps/assets/example_image/sample_03.jpg",
190
- ]
191
- ), # input raw image_path if only one image
192
- )
193
- print(response)
194
- else:
195
- response = GPT_CLIENT.query(
196
- text_prompt="What is the content in the images?",
197
- image_base64=[
198
- Image.open("apps/assets/example_image/sample_02.jpg"),
199
- Image.open("apps/assets/example_image/sample_03.jpg"),
200
- ],
201
- )
202
- print(response)
203
-
204
- # test2: text prompt
205
- response = GPT_CLIENT.query(
206
- text_prompt="What is the capital of China?"
207
- )
208
- print(response)
 
30
  stop_after_delay,
31
  wait_random_exponential,
32
  )
33
+ from embodied_gen.utils.process_media import combine_images_to_grid
34
 
35
+ logging.getLogger("httpx").setLevel(logging.WARNING)
36
+ logging.basicConfig(level=logging.WARNING)
37
  logger = logging.getLogger(__name__)
38
 
39
 
40
+ __all__ = [
41
+ "GPTclient",
42
+ ]
43
+
44
+ CONFIG_FILE = "embodied_gen/utils/gpt_config.yaml"
45
+
46
+
47
  class GPTclient:
48
  """A client to interact with the GPT model via OpenAI or Azure API."""
49
 
 
53
  api_key: str,
54
  model_name: str = "yfb-gpt-4o",
55
  api_version: str = None,
56
+ check_connection: bool = True,
57
  verbose: bool = False,
58
  ):
59
  if api_version is not None:
 
72
  self.model_name = model_name
73
  self.image_formats = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif"}
74
  self.verbose = verbose
75
+ if check_connection:
76
+ self.check_connection()
77
+
78
  logger.info(f"Using GPT model: {self.model_name}.")
79
 
80
  @retry(
 
89
  text_prompt: str,
90
  image_base64: Optional[list[str | Image.Image]] = None,
91
  system_role: Optional[str] = None,
92
+ params: Optional[dict] = None,
93
  ) -> Optional[str]:
94
  """Queries the GPT model with a text and optional image prompts.
95
 
 
99
  or local image paths or PIL.Image to accompany the text prompt.
100
  system_role (Optional[str]): Optional system-level instructions
101
  that specify the behavior of the assistant.
102
+ params (Optional[dict]): Additional parameters for GPT setting.
103
 
104
  Returns:
105
  Optional[str]: The response content generated by the model based on
 
117
 
118
  # Process images if provided
119
  if image_base64 is not None:
120
+ if not isinstance(image_base64, list):
121
+ image_base64 = [image_base64]
122
+ # Temporary workaround: OpenRouter cannot take multiple images in one request, so stitch them into a grid.
123
+ if "openrouter" in self.endpoint:
124
+ image_base64 = combine_images_to_grid(image_base64)
125
  for img in image_base64:
126
  if isinstance(img, Image.Image):
127
  buffer = BytesIO()
 
156
  "frequency_penalty": 0,
157
  "presence_penalty": 0,
158
  "stop": None,
159
+ "model": self.model_name,
160
  }
161
+
162
+ if params:
163
+ payload.update(params)
164
 
165
  response = None
166
  try:
 
176
 
177
  return response
178
 
179
+ def check_connection(self) -> None:
180
+ """Check whether the GPT API connection is working."""
181
+ try:
182
+ response = self.completion_with_backoff(
183
+ messages=[
184
+ {"role": "system", "content": "You are a test system."},
185
+ {"role": "user", "content": "Hello"},
186
+ ],
187
+ model=self.model_name,
188
+ temperature=0,
189
+ max_tokens=100,
190
+ )
191
+ content = response.choices[0].message.content
192
+ logger.info(f"Connection check success.")
193
+ except Exception as e:
194
+ raise ConnectionError(
195
+ f"Failed to connect to GPT API at {self.endpoint}, "
196
+ f"please check setting in `{CONFIG_FILE}` and `README`."
197
+ )
198
 
199
+
200
+ with open(CONFIG_FILE, "r") as f:
201
  config = yaml.safe_load(f)
202
 
203
  agent_type = config["agent_type"]
 
214
  api_key=api_key,
215
  api_version=api_version,
216
  model_name=model_name,
217
+ check_connection=False,
218
  )
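A sketch of the updated client usage; the endpoint, key, and model id below are placeholders, and `params` is forwarded into the request payload:

from embodied_gen.utils.gpt_clients import GPTclient

client = GPTclient(
    endpoint="https://openrouter.ai/api/v1",
    api_key="<your-api-key>",
    model_name="<provider/model-id>",
    check_connection=False,  # set True to fail fast if the endpoint is unreachable
)
answer = client.query(
    text_prompt="What is the content in each image?",
    image_base64=[
        "apps/assets/example_image/sample_02.jpg",
        "apps/assets/example_image/sample_03.jpg",
    ],
    params={"temperature": 0.2, "max_tokens": 256},
)
print(answer)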
 
 
 
 
 
embodied_gen/utils/log.py ADDED
@@ -0,0 +1,48 @@
 
 
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+ import logging
18
+
19
+ from colorlog import ColoredFormatter
20
+
21
+ __all__ = [
22
+ "logger",
23
+ ]
24
+
25
+ LOG_FORMAT = (
26
+ "%(log_color)s[%(asctime)s] %(levelname)-8s | %(message)s%(reset)s"
27
+ )
28
+ DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
29
+
30
+ formatter = ColoredFormatter(
31
+ LOG_FORMAT,
32
+ datefmt=DATE_FORMAT,
33
+ log_colors={
34
+ "DEBUG": "cyan",
35
+ "INFO": "green",
36
+ "WARNING": "yellow",
37
+ "ERROR": "red",
38
+ "CRITICAL": "bold_red",
39
+ },
40
+ )
41
+
42
+ handler = logging.StreamHandler()
43
+ handler.setFormatter(formatter)
44
+
45
+ logger = logging.getLogger(__name__)
46
+ logger.setLevel(logging.INFO)
47
+ logger.addHandler(handler)
48
+ logger.propagate = False
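Modules can simply reuse the shared colored logger, for example:

from embodied_gen.utils.log import logger

logger.info("asset generation started")
logger.warning("falling back to CPU rendering")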
embodied_gen/utils/monkey_patches.py ADDED
@@ -0,0 +1,152 @@
 
 
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+ import os
18
+ import sys
19
+ import zipfile
20
+
21
+ import torch
22
+ from huggingface_hub import hf_hub_download
23
+ from omegaconf import OmegaConf
24
+ from PIL import Image
25
+ from torchvision import transforms
26
+
27
+
28
+ def monkey_patch_pano2room():
29
+ current_file_path = os.path.abspath(__file__)
30
+ current_dir = os.path.dirname(current_file_path)
31
+ sys.path.append(os.path.join(current_dir, "../.."))
32
+ sys.path.append(os.path.join(current_dir, "../../thirdparty/pano2room"))
33
+ from thirdparty.pano2room.modules.geo_predictors.omnidata.omnidata_normal_predictor import (
34
+ OmnidataNormalPredictor,
35
+ )
36
+ from thirdparty.pano2room.modules.geo_predictors.omnidata.omnidata_predictor import (
37
+ OmnidataPredictor,
38
+ )
39
+
40
+ def patched_omni_depth_init(self):
41
+ self.img_size = 384
42
+ self.model = torch.hub.load(
43
+ 'alexsax/omnidata_models', 'depth_dpt_hybrid_384'
44
+ )
45
+ self.model.eval()
46
+ self.trans_totensor = transforms.Compose(
47
+ [
48
+ transforms.Resize(self.img_size, interpolation=Image.BILINEAR),
49
+ transforms.CenterCrop(self.img_size),
50
+ transforms.Normalize(mean=0.5, std=0.5),
51
+ ]
52
+ )
53
+
54
+ OmnidataPredictor.__init__ = patched_omni_depth_init
55
+
56
+ def patched_omni_normal_init(self):
57
+ self.img_size = 384
58
+ self.model = torch.hub.load(
59
+ 'alexsax/omnidata_models', 'surface_normal_dpt_hybrid_384'
60
+ )
61
+ self.model.eval()
62
+ self.trans_totensor = transforms.Compose(
63
+ [
64
+ transforms.Resize(self.img_size, interpolation=Image.BILINEAR),
65
+ transforms.CenterCrop(self.img_size),
66
+ transforms.Normalize(mean=0.5, std=0.5),
67
+ ]
68
+ )
69
+
70
+ OmnidataNormalPredictor.__init__ = patched_omni_normal_init
71
+
72
+ def patched_panojoint_init(self, save_path=None):
73
+ self.depth_predictor = OmnidataPredictor()
74
+ self.normal_predictor = OmnidataNormalPredictor()
75
+ self.save_path = save_path
76
+
77
+ from modules.geo_predictors import PanoJointPredictor
78
+
79
+ PanoJointPredictor.__init__ = patched_panojoint_init
80
+
81
+ # NOTE: We use gsplat instead.
82
+ # import depth_diff_gaussian_rasterization_min as ddgr
83
+ # from dataclasses import dataclass
84
+ # @dataclass
85
+ # class PatchedGaussianRasterizationSettings:
86
+ # image_height: int
87
+ # image_width: int
88
+ # tanfovx: float
89
+ # tanfovy: float
90
+ # bg: torch.Tensor
91
+ # scale_modifier: float
92
+ # viewmatrix: torch.Tensor
93
+ # projmatrix: torch.Tensor
94
+ # sh_degree: int
95
+ # campos: torch.Tensor
96
+ # prefiltered: bool
97
+ # debug: bool = False
98
+ # ddgr.GaussianRasterizationSettings = PatchedGaussianRasterizationSettings
99
+
100
+ # disable get_has_ddp_rank print in `BaseInpaintingTrainingModule`
101
+ os.environ["NODE_RANK"] = "0"
102
+
103
+ from thirdparty.pano2room.modules.inpainters.lama.saicinpainting.training.trainers import (
104
+ load_checkpoint,
105
+ )
106
+ from thirdparty.pano2room.modules.inpainters.lama_inpainter import (
107
+ LamaInpainter,
108
+ )
109
+
110
+ def patched_lama_inpaint_init(self):
111
+ zip_path = hf_hub_download(
112
+ repo_id="smartywu/big-lama",
113
+ filename="big-lama.zip",
114
+ repo_type="model",
115
+ )
116
+ extract_dir = os.path.splitext(zip_path)[0]
117
+
118
+ if not os.path.exists(extract_dir):
119
+ os.makedirs(extract_dir, exist_ok=True)
120
+ with zipfile.ZipFile(zip_path, "r") as zip_ref:
121
+ zip_ref.extractall(extract_dir)
122
+
123
+ config_path = os.path.join(extract_dir, 'big-lama', 'config.yaml')
124
+ checkpoint_path = os.path.join(
125
+ extract_dir, 'big-lama/models/best.ckpt'
126
+ )
127
+ train_config = OmegaConf.load(config_path)
128
+ train_config.training_model.predict_only = True
129
+ train_config.visualizer.kind = 'noop'
130
+
131
+ self.model = load_checkpoint(
132
+ train_config, checkpoint_path, strict=False, map_location='cpu'
133
+ )
134
+ self.model.freeze()
135
+
136
+ LamaInpainter.__init__ = patched_lama_inpaint_init
137
+
138
+ from diffusers import StableDiffusionInpaintPipeline
139
+ from thirdparty.pano2room.modules.inpainters.SDFT_inpainter import (
140
+ SDFTInpainter,
141
+ )
142
+
143
+ def patched_sd_inpaint_init(self, subset_name=None):
144
+ super(SDFTInpainter, self).__init__()
145
+ pipe = StableDiffusionInpaintPipeline.from_pretrained(
146
+ "stabilityai/stable-diffusion-2-inpainting",
147
+ torch_dtype=torch.float16,
148
+ ).to("cuda")
149
+ pipe.enable_model_cpu_offload()
150
+ self.inpaint_pipe = pipe
151
+
152
+ SDFTInpainter.__init__ = patched_sd_inpaint_init
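The patches are meant to be applied before any pano2room predictor or inpainter is instantiated; a rough sketch (assumes the `thirdparty/pano2room` checkout is importable):

from embodied_gen.utils.monkey_patches import monkey_patch_pano2room

monkey_patch_pano2room()  # swaps in HF-hosted omnidata, big-lama, and SD-inpaint weights

from thirdparty.pano2room.modules.inpainters.lama_inpainter import LamaInpainter

inpainter = LamaInpainter()  # now pulls big-lama.zip from the Hugging Face Hub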
embodied_gen/utils/process_media.py CHANGED
@@ -15,34 +15,25 @@
15
  # permissions and limitations under the License.
16
 
17
 
18
- import base64
19
  import logging
20
  import math
 
21
  import os
22
- import sys
23
  from glob import glob
24
- from io import BytesIO
25
  from typing import Union
26
 
27
  import cv2
28
  import imageio
 
 
29
  import numpy as np
30
- import PIL.Image as Image
31
  import spaces
32
- import torch
33
  from moviepy.editor import VideoFileClip, clips_array
34
- from tqdm import tqdm
35
  from embodied_gen.data.differentiable_render import entrypoint as render_api
36
-
37
- current_file_path = os.path.abspath(__file__)
38
- current_dir = os.path.dirname(current_file_path)
39
- sys.path.append(os.path.join(current_dir, "../.."))
40
- from thirdparty.TRELLIS.trellis.renderers.mesh_renderer import MeshRenderer
41
- from thirdparty.TRELLIS.trellis.representations import MeshExtractResult
42
- from thirdparty.TRELLIS.trellis.utils.render_utils import (
43
- render_frames,
44
- yaw_pitch_r_fov_to_extrinsics_intrinsics,
45
- )
46
 
47
  logging.basicConfig(level=logging.INFO)
48
  logger = logging.getLogger(__name__)
@@ -53,9 +44,11 @@ __all__ = [
53
  "merge_images_video",
54
  "filter_small_connected_components",
55
  "filter_image_small_connected_components",
56
- "combine_images_to_base64",
57
- "render_mesh",
58
- "render_video",
 
 
59
  ]
60
 
61
 
@@ -66,12 +59,14 @@ def render_asset3d(
66
  distance: float = 5.0,
67
  num_images: int = 1,
68
  elevation: list[float] = (0.0,),
69
- pbr_light_factor: float = 1.5,
70
  return_key: str = "image_color/*",
71
  output_subdir: str = "renders",
72
  gen_color_mp4: bool = False,
73
  gen_viewnormal_mp4: bool = False,
74
  gen_glonormal_mp4: bool = False,
 
 
75
  ) -> list[str]:
76
  input_args = dict(
77
  mesh_path=mesh_path,
@@ -81,14 +76,13 @@ def render_asset3d(
81
  num_images=num_images,
82
  elevation=elevation,
83
  pbr_light_factor=pbr_light_factor,
84
- with_mtl=True,
 
 
 
 
85
  )
86
- if gen_color_mp4:
87
- input_args["gen_color_mp4"] = True
88
- if gen_viewnormal_mp4:
89
- input_args["gen_viewnormal_mp4"] = True
90
- if gen_glonormal_mp4:
91
- input_args["gen_glonormal_mp4"] = True
92
  try:
93
  _ = render_api(**input_args)
94
  except Exception as e:
@@ -168,12 +162,15 @@ def filter_image_small_connected_components(
168
  return image
169
 
170
 
171
- def combine_images_to_base64(
172
  images: list[str | Image.Image],
173
  cat_row_col: tuple[int, int] = None,
174
  target_wh: tuple[int, int] = (512, 512),
175
- ) -> str:
176
  n_images = len(images)
 
 
 
177
  if cat_row_col is None:
178
  n_col = math.ceil(math.sqrt(n_images))
179
  n_row = math.ceil(n_images / n_col)
@@ -182,88 +179,229 @@ def combine_images_to_base64(
182
 
183
  images = [
184
  Image.open(p).convert("RGB") if isinstance(p, str) else p
185
- for p in images[: n_row * n_col]
186
  ]
187
  images = [img.resize(target_wh) for img in images]
188
 
189
  grid_w, grid_h = n_col * target_wh[0], n_row * target_wh[1]
190
- grid = Image.new("RGB", (grid_w, grid_h), (255, 255, 255))
191
 
192
  for idx, img in enumerate(images):
193
  row, col = divmod(idx, n_col)
194
  grid.paste(img, (col * target_wh[0], row * target_wh[1]))
195
 
196
- buffer = BytesIO()
197
- grid.save(buffer, format="PNG")
198
-
199
- return base64.b64encode(buffer.getvalue()).decode("utf-8")
 
 
 
 
 
 
 
200
 
 
 
 
 
 
 
 
 
 
201
 
202
- @spaces.GPU
203
- def render_mesh(sample, extrinsics, intrinsics, options={}, **kwargs):
204
- renderer = MeshRenderer()
205
- renderer.rendering_options.resolution = options.get("resolution", 512)
206
- renderer.rendering_options.near = options.get("near", 1)
207
- renderer.rendering_options.far = options.get("far", 100)
208
- renderer.rendering_options.ssaa = options.get("ssaa", 4)
209
- rets = {}
210
- for extr, intr in tqdm(zip(extrinsics, intrinsics), desc="Rendering"):
211
- res = renderer.render(sample, extr, intr)
212
- if "normal" not in rets:
213
- rets["normal"] = []
214
- normal = torch.lerp(
215
- torch.zeros_like(res["normal"]), res["normal"], res["mask"]
 
 
 
 
 
 
 
216
  )
217
- normal = np.clip(
218
- normal.detach().cpu().numpy().transpose(1, 2, 0) * 255, 0, 255
219
- ).astype(np.uint8)
220
- rets["normal"].append(normal)
221
 
222
- return rets
 
 
223
 
224
 
225
- @spaces.GPU
226
- def render_video(
227
- sample,
228
- resolution=512,
229
- bg_color=(0, 0, 0),
230
- num_frames=300,
231
- r=2,
232
- fov=40,
233
- **kwargs,
234
- ):
235
- yaws = torch.linspace(0, 2 * 3.1415, num_frames)
236
- yaws = yaws.tolist()
237
- pitch = [0.5] * num_frames
238
- extrinsics, intrinsics = yaw_pitch_r_fov_to_extrinsics_intrinsics(
239
- yaws, pitch, r, fov
240
- )
241
- render_fn = (
242
- render_mesh if isinstance(sample, MeshExtractResult) else render_frames
243
- )
244
- result = render_fn(
245
- sample,
246
- extrinsics,
247
- intrinsics,
248
- {"resolution": resolution, "bg_color": bg_color},
249
- **kwargs,
250
- )
251
 
252
- return result
 
 
 
 
 
 
 
253
 
254
 
255
  if __name__ == "__main__":
256
- # Example usage:
257
  merge_video_video(
258
  "outputs/imageto3d/room_bottle7/room_bottle_007/URDF_room_bottle_007/mesh_glo_normal.mp4", # noqa
259
  "outputs/imageto3d/room_bottle7/room_bottle_007/URDF_room_bottle_007/mesh.mp4", # noqa
260
  "merge.mp4",
261
  )
262
-
263
- image_base64 = combine_images_to_base64(
264
- [
265
- "apps/assets/example_image/sample_00.jpg",
266
- "apps/assets/example_image/sample_01.jpg",
267
- "apps/assets/example_image/sample_02.jpg",
268
- ]
269
- )
 
15
  # permissions and limitations under the License.
16
 
17
 
 
18
  import logging
19
  import math
20
+ import mimetypes
21
  import os
22
+ import textwrap
23
  from glob import glob
 
24
  from typing import Union
25
 
26
  import cv2
27
  import imageio
28
+ import matplotlib.pyplot as plt
29
+ import networkx as nx
30
  import numpy as np
 
31
  import spaces
32
+ from matplotlib.patches import Patch
33
  from moviepy.editor import VideoFileClip, clips_array
34
+ from PIL import Image
35
  from embodied_gen.data.differentiable_render import entrypoint as render_api
36
+ from embodied_gen.utils.enum import LayoutInfo, Scene3DItemEnum
 
 
 
 
 
 
 
 
 
37
 
38
  logging.basicConfig(level=logging.INFO)
39
  logger = logging.getLogger(__name__)
 
44
  "merge_images_video",
45
  "filter_small_connected_components",
46
  "filter_image_small_connected_components",
47
+ "combine_images_to_grid",
48
+ "SceneTreeVisualizer",
49
+ "is_image_file",
50
+ "parse_text_prompts",
51
+ "check_object_edge_truncated",
52
  ]
53
 
54
 
 
59
  distance: float = 5.0,
60
  num_images: int = 1,
61
  elevation: list[float] = (0.0,),
62
+ pbr_light_factor: float = 1.2,
63
  return_key: str = "image_color/*",
64
  output_subdir: str = "renders",
65
  gen_color_mp4: bool = False,
66
  gen_viewnormal_mp4: bool = False,
67
  gen_glonormal_mp4: bool = False,
68
+ no_index_file: bool = False,
69
+ with_mtl: bool = True,
70
  ) -> list[str]:
71
  input_args = dict(
72
  mesh_path=mesh_path,
 
76
  num_images=num_images,
77
  elevation=elevation,
78
  pbr_light_factor=pbr_light_factor,
79
+ with_mtl=with_mtl,
80
+ gen_color_mp4=gen_color_mp4,
81
+ gen_viewnormal_mp4=gen_viewnormal_mp4,
82
+ gen_glonormal_mp4=gen_glonormal_mp4,
83
+ no_index_file=no_index_file,
84
  )
85
+
 
 
 
 
 
86
  try:
87
  _ = render_api(**input_args)
88
  except Exception as e:
 
162
  return image
163
 
164
 
165
+ def combine_images_to_grid(
166
  images: list[str | Image.Image],
167
  cat_row_col: tuple[int, int] = None,
168
  target_wh: tuple[int, int] = (512, 512),
169
+ ) -> list[str | Image.Image]:
170
  n_images = len(images)
171
+ if n_images == 1:
172
+ return images
173
+
174
  if cat_row_col is None:
175
  n_col = math.ceil(math.sqrt(n_images))
176
  n_row = math.ceil(n_images / n_col)
 
179
 
180
  images = [
181
  Image.open(p).convert("RGB") if isinstance(p, str) else p
182
+ for p in images
183
  ]
184
  images = [img.resize(target_wh) for img in images]
185
 
186
  grid_w, grid_h = n_col * target_wh[0], n_row * target_wh[1]
187
+ grid = Image.new("RGB", (grid_w, grid_h), (0, 0, 0))
188
 
189
  for idx, img in enumerate(images):
190
  row, col = divmod(idx, n_col)
191
  grid.paste(img, (col * target_wh[0], row * target_wh[1]))
192
 
193
+ return [grid]
194
+
195
+
196
+ class SceneTreeVisualizer:
197
+ def __init__(self, layout_info: LayoutInfo) -> None:
198
+ self.tree = layout_info.tree
199
+ self.relation = layout_info.relation
200
+ self.objs_desc = layout_info.objs_desc
201
+ self.G = nx.DiGraph()
202
+ self.root = self._find_root()
203
+ self._build_graph()
204
+
205
+ self.role_colors = {
206
+ Scene3DItemEnum.BACKGROUND.value: "plum",
207
+ Scene3DItemEnum.CONTEXT.value: "lightblue",
208
+ Scene3DItemEnum.ROBOT.value: "lightcoral",
209
+ Scene3DItemEnum.MANIPULATED_OBJS.value: "lightgreen",
210
+ Scene3DItemEnum.DISTRACTOR_OBJS.value: "lightgray",
211
+ Scene3DItemEnum.OTHERS.value: "orange",
212
+ }
213
+
214
+ def _find_root(self) -> str:
215
+ children = {c for cs in self.tree.values() for c, _ in cs}
216
+ parents = set(self.tree.keys())
217
+ roots = parents - children
218
+ if not roots:
219
+ raise ValueError("No root node found.")
220
+ return next(iter(roots))
221
+
222
+ def _build_graph(self):
223
+ for parent, children in self.tree.items():
224
+ for child, relation in children:
225
+ self.G.add_edge(parent, child, relation=relation)
226
+
227
+ def _get_node_role(self, node: str) -> str:
228
+ if node == self.relation.get(Scene3DItemEnum.BACKGROUND.value):
229
+ return Scene3DItemEnum.BACKGROUND.value
230
+ if node == self.relation.get(Scene3DItemEnum.CONTEXT.value):
231
+ return Scene3DItemEnum.CONTEXT.value
232
+ if node == self.relation.get(Scene3DItemEnum.ROBOT.value):
233
+ return Scene3DItemEnum.ROBOT.value
234
+ if node in self.relation.get(
235
+ Scene3DItemEnum.MANIPULATED_OBJS.value, []
236
+ ):
237
+ return Scene3DItemEnum.MANIPULATED_OBJS.value
238
+ if node in self.relation.get(
239
+ Scene3DItemEnum.DISTRACTOR_OBJS.value, []
240
+ ):
241
+ return Scene3DItemEnum.DISTRACTOR_OBJS.value
242
+ return Scene3DItemEnum.OTHERS.value
243
+
244
+ def _get_positions(
245
+ self, root, width=1.0, vert_gap=0.1, vert_loc=1, xcenter=0.5, pos=None
246
+ ):
247
+ if pos is None:
248
+ pos = {root: (xcenter, vert_loc)}
249
+ else:
250
+ pos[root] = (xcenter, vert_loc)
251
+
252
+ children = list(self.G.successors(root))
253
+ if children:
254
+ dx = width / len(children)
255
+ next_x = xcenter - width / 2 - dx / 2
256
+ for child in children:
257
+ next_x += dx
258
+ pos = self._get_positions(
259
+ child,
260
+ width=dx,
261
+ vert_gap=vert_gap,
262
+ vert_loc=vert_loc - vert_gap,
263
+ xcenter=next_x,
264
+ pos=pos,
265
+ )
266
+ return pos
267
+
268
+ def render(
269
+ self,
270
+ save_path: str,
271
+ figsize=(8, 6),
272
+ dpi=300,
273
+ title: str = "Scene 3D Hierarchy Tree",
274
+ ):
275
+ node_colors = [
276
+ self.role_colors[self._get_node_role(n)] for n in self.G.nodes
277
+ ]
278
+ pos = self._get_positions(self.root)
279
+
280
+ plt.figure(figsize=figsize)
281
+ nx.draw(
282
+ self.G,
283
+ pos,
284
+ with_labels=True,
285
+ arrows=False,
286
+ node_size=2000,
287
+ node_color=node_colors,
288
+ font_size=10,
289
+ font_weight="bold",
290
+ )
291
 
292
+ # Draw edge labels
293
+ edge_labels = nx.get_edge_attributes(self.G, "relation")
294
+ nx.draw_networkx_edge_labels(
295
+ self.G,
296
+ pos,
297
+ edge_labels=edge_labels,
298
+ font_size=9,
299
+ font_color="black",
300
+ )
301
 
302
+ # Draw small description text under each node (if available)
303
+ for node, (x, y) in pos.items():
304
+ desc = self.objs_desc.get(node)
305
+ if desc:
306
+ wrapped = "\n".join(textwrap.wrap(desc, width=30))
307
+ plt.text(
308
+ x,
309
+ y - 0.006,
310
+ wrapped,
311
+ fontsize=6,
312
+ ha="center",
313
+ va="top",
314
+ wrap=True,
315
+ color="black",
316
+ bbox=dict(
317
+ facecolor="dimgray",
318
+ edgecolor="darkgray",
319
+ alpha=0.1,
320
+ boxstyle="round,pad=0.2",
321
+ ),
322
+ )
323
+
324
+ plt.title(title, fontsize=12)
325
+ task_desc = self.relation.get("task_desc", "")
326
+ if task_desc:
327
+ plt.suptitle(
328
+ f"Task Description: {task_desc}", fontsize=10, y=0.999
329
+ )
330
+
331
+ plt.axis("off")
332
+
333
+ legend_handles = [
334
+ Patch(facecolor=color, edgecolor='black', label=role)
335
+ for role, color in self.role_colors.items()
336
+ ]
337
+ plt.legend(
338
+ handles=legend_handles,
339
+ loc="lower center",
340
+ ncol=3,
341
+ bbox_to_anchor=(0.5, -0.1),
342
+ fontsize=9,
343
  )
 
 
 
 
344
 
345
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
346
+ plt.savefig(save_path, dpi=dpi, bbox_inches="tight")
347
+ plt.close()
348
 
349
 
350
+ def load_scene_dict(file_path: str) -> dict:
351
+ scene_dict = {}
352
+ with open(file_path, "r", encoding='utf-8') as f:
353
+ for line in f:
354
+ line = line.strip()
355
+ if not line or ":" not in line:
356
+ continue
357
+ scene_id, desc = line.split(":", 1)
358
+ scene_dict[scene_id.strip()] = desc.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
 
360
+ return scene_dict
361
+
362
+
363
+ def is_image_file(filename: str) -> bool:
364
+ mime_type, _ = mimetypes.guess_type(filename)
365
+
366
+ return mime_type is not None and mime_type.startswith('image')
367
+
368
+
369
+ def parse_text_prompts(prompts: list[str]) -> list[str]:
370
+ if len(prompts) == 1 and prompts[0].endswith(".txt"):
371
+ with open(prompts[0], "r") as f:
372
+ prompts = [
373
+ line.strip()
374
+ for line in f
375
+ if line.strip() and not line.strip().startswith("#")
376
+ ]
377
+ return prompts
378
+
379
+
380
+ def check_object_edge_truncated(
381
+ mask: np.ndarray, edge_threshold: int = 5
382
+ ) -> bool:
383
+ """Checks if a binary object mask is truncated at the image edges.
384
+
385
+ Args:
386
+ mask: A 2D binary NumPy array where nonzero values indicate the object region.
387
+ edge_threshold: Number of pixels from each image edge to consider for truncation.
388
+ Defaults to 5.
389
+
390
+ Returns:
391
+ True if the object is fully enclosed (not truncated).
392
+ False if the object touches or crosses any image boundary.
393
+ """
394
+ top = mask[:edge_threshold, :].any()
395
+ bottom = mask[-edge_threshold:, :].any()
396
+ left = mask[:, :edge_threshold].any()
397
+ right = mask[:, -edge_threshold:].any()
398
+
399
+ return not (top or bottom or left or right)
400
 
401
 
402
  if __name__ == "__main__":
 
403
  merge_video_video(
404
  "outputs/imageto3d/room_bottle7/room_bottle_007/URDF_room_bottle_007/mesh_glo_normal.mp4", # noqa
405
  "outputs/imageto3d/room_bottle7/room_bottle_007/URDF_room_bottle_007/mesh.mp4", # noqa
406
  "merge.mp4",
407
  )
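A short sketch of the new media helpers (file paths are illustrative):

import numpy as np
from PIL import Image
from embodied_gen.utils.process_media import (
    check_object_edge_truncated,
    combine_images_to_grid,
    parse_text_prompts,
)

grid = combine_images_to_grid(
    [
        "apps/assets/example_image/sample_00.jpg",
        "apps/assets/example_image/sample_01.jpg",
    ]
)[0]  # single-element list holding the stitched PIL image

mask = np.array(Image.open("object_mask.png").convert("L")) > 127
print(check_object_edge_truncated(mask))  # False if the mask touches an image edge

prompts = parse_text_prompts(["prompts.txt"])  # or pass the prompts directly as a list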
 
 
 
 
 
 
 
 
embodied_gen/utils/tags.py CHANGED
@@ -1 +1 @@
1
- VERSION = "v0.1.0"
 
1
+ VERSION = "v0.1.2"
embodied_gen/utils/trender.py ADDED
@@ -0,0 +1,90 @@
 
 
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+ import os
18
+ import sys
19
+
20
+ import numpy as np
21
+ import spaces
22
+ import torch
23
+ from tqdm import tqdm
24
+
25
+ current_file_path = os.path.abspath(__file__)
26
+ current_dir = os.path.dirname(current_file_path)
27
+ sys.path.append(os.path.join(current_dir, "../.."))
28
+ from thirdparty.TRELLIS.trellis.renderers.mesh_renderer import MeshRenderer
29
+ from thirdparty.TRELLIS.trellis.representations import MeshExtractResult
30
+ from thirdparty.TRELLIS.trellis.utils.render_utils import (
31
+ render_frames,
32
+ yaw_pitch_r_fov_to_extrinsics_intrinsics,
33
+ )
34
+
35
+ __all__ = [
36
+ "render_video",
37
+ ]
38
+
39
+
40
+ @spaces.GPU
41
+ def render_mesh(sample, extrinsics, intrinsics, options={}, **kwargs):
42
+ renderer = MeshRenderer()
43
+ renderer.rendering_options.resolution = options.get("resolution", 512)
44
+ renderer.rendering_options.near = options.get("near", 1)
45
+ renderer.rendering_options.far = options.get("far", 100)
46
+ renderer.rendering_options.ssaa = options.get("ssaa", 4)
47
+ rets = {}
48
+ for extr, intr in tqdm(zip(extrinsics, intrinsics), desc="Rendering"):
49
+ res = renderer.render(sample, extr, intr)
50
+ if "normal" not in rets:
51
+ rets["normal"] = []
52
+ normal = torch.lerp(
53
+ torch.zeros_like(res["normal"]), res["normal"], res["mask"]
54
+ )
55
+ normal = np.clip(
56
+ normal.detach().cpu().numpy().transpose(1, 2, 0) * 255, 0, 255
57
+ ).astype(np.uint8)
58
+ rets["normal"].append(normal)
59
+
60
+ return rets
61
+
62
+
63
+ @spaces.GPU
64
+ def render_video(
65
+ sample,
66
+ resolution=512,
67
+ bg_color=(0, 0, 0),
68
+ num_frames=300,
69
+ r=2,
70
+ fov=40,
71
+ **kwargs,
72
+ ):
73
+ yaws = torch.linspace(0, 2 * 3.1415, num_frames)
74
+ yaws = yaws.tolist()
75
+ pitch = [0.5] * num_frames
76
+ extrinsics, intrinsics = yaw_pitch_r_fov_to_extrinsics_intrinsics(
77
+ yaws, pitch, r, fov
78
+ )
79
+ render_fn = (
80
+ render_mesh if isinstance(sample, MeshExtractResult) else render_frames
81
+ )
82
+ result = render_fn(
83
+ sample,
84
+ extrinsics,
85
+ intrinsics,
86
+ {"resolution": resolution, "bg_color": bg_color},
87
+ **kwargs,
88
+ )
89
+
90
+ return result
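For a TRELLIS mesh output, the relocated renderer can be used roughly like this (`mesh_result` is assumed to be a `MeshExtractResult` produced upstream):

import imageio
from embodied_gen.utils.trender import render_video

frames = render_video(mesh_result, resolution=512, num_frames=120)["normal"]
imageio.mimsave("mesh_normal.mp4", frames, fps=30)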
embodied_gen/validators/aesthetic_predictor.py CHANGED
@@ -102,7 +102,7 @@ class AestheticPredictor:
102
  def _load_sac_model(self, model_path, input_size):
103
  """Load the SAC model."""
104
  model = self.MLP(input_size)
105
- ckpt = torch.load(model_path)
106
  model.load_state_dict(ckpt)
107
  model.to(self.device)
108
  model.eval()
@@ -135,15 +135,3 @@ class AestheticPredictor:
135
  )
136
 
137
  return prediction.item()
138
-
139
-
140
- if __name__ == "__main__":
141
- # Configuration
142
- img_path = "apps/assets/example_image/sample_00.jpg"
143
-
144
- # Initialize the predictor
145
- predictor = AestheticPredictor()
146
-
147
- # Predict the aesthetic score
148
- score = predictor.predict(img_path)
149
- print("Aesthetic score predicted by the model:", score)
 
102
  def _load_sac_model(self, model_path, input_size):
103
  """Load the SAC model."""
104
  model = self.MLP(input_size)
105
+ ckpt = torch.load(model_path, weights_only=True)
106
  model.load_state_dict(ckpt)
107
  model.to(self.device)
108
  model.eval()
 
135
  )
136
 
137
  return prediction.item()
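The loading change is transparent to callers; the usage from the removed `__main__` demo still applies:

from embodied_gen.validators.aesthetic_predictor import AestheticPredictor

predictor = AestheticPredictor()
score = predictor.predict("apps/assets/example_image/sample_00.jpg")
print("Aesthetic score predicted by the model:", score)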
 
 
 
 
 
 
 
 
 
 
 
 
embodied_gen/validators/quality_checkers.py CHANGED
@@ -16,17 +16,29 @@
16
 
17
 
18
  import logging
19
- import os
20
 
21
- from tqdm import tqdm
 
22
  from embodied_gen.utils.gpt_clients import GPT_CLIENT, GPTclient
23
- from embodied_gen.utils.process_media import render_asset3d
24
  from embodied_gen.validators.aesthetic_predictor import AestheticPredictor
25
 
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
 
29
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  class BaseChecker:
31
  def __init__(self, prompt: str = None, verbose: bool = False) -> None:
32
  self.prompt = prompt
@@ -37,16 +49,20 @@ class BaseChecker:
37
  "Subclasses must implement the query method."
38
  )
39
 
40
- def __call__(self, *args, **kwargs) -> bool:
41
  response = self.query(*args, **kwargs)
42
- if response is None:
43
- response = "Error when calling gpt api."
44
-
45
- if self.verbose and response != "YES":
46
  logger.info(response)
47
 
48
- flag = "YES" in response
49
- response = "YES" if flag else response
 
 
 
 
 
 
 
50
 
51
  return flag, response
52
 
@@ -92,21 +108,29 @@ class MeshGeoChecker(BaseChecker):
92
  self.gpt_client = gpt_client
93
  if self.prompt is None:
94
  self.prompt = """
95
- Refer to the provided multi-view rendering images to evaluate
96
- whether the geometry of the 3D object asset is complete and
97
- whether the asset can be placed stably on the ground.
98
- Return "YES" only if reach the requirments,
99
- otherwise "NO" and explain the reason very briefly.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  """
101
 
102
- def query(self, image_paths: str) -> str:
103
- # Hardcode tmp because of the openrouter can't input multi images.
104
- if "openrouter" in self.gpt_client.endpoint:
105
- from embodied_gen.utils.process_media import (
106
- combine_images_to_base64,
107
- )
108
-
109
- image_paths = combine_images_to_base64(image_paths)
110
 
111
  return self.gpt_client.query(
112
  text_prompt=self.prompt,
@@ -137,14 +161,19 @@ class ImageSegChecker(BaseChecker):
137
  self.gpt_client = gpt_client
138
  if self.prompt is None:
139
  self.prompt = """
140
- The first image is the original, and the second image is the
141
- result after segmenting the main object. Evaluate the segmentation
142
- quality to ensure the main object is clearly segmented without
143
- significant truncation. Note that the foreground of the object
144
- needs to be extracted instead of the background.
145
- Minor imperfections can be ignored. If segmentation is acceptable,
146
- return "YES" only; otherwise, return "NO" with
147
- very brief explanation.
 
 
 
 
 
148
  """
149
 
150
  def query(self, image_paths: list[str]) -> str:
@@ -152,13 +181,6 @@ class ImageSegChecker(BaseChecker):
152
  raise ValueError(
153
  "ImageSegChecker requires exactly two images: [raw_image, seg_image]." # noqa
154
  )
155
- # Hardcode tmp because of the openrouter can't input multi images.
156
- if "openrouter" in self.gpt_client.endpoint:
157
- from embodied_gen.utils.process_media import (
158
- combine_images_to_base64,
159
- )
160
-
161
- image_paths = combine_images_to_base64(image_paths)
162
 
163
  return self.gpt_client.query(
164
  text_prompt=self.prompt,
@@ -201,42 +223,358 @@ class ImageAestheticChecker(BaseChecker):
201
  return avg_score > self.thresh, avg_score
202
 
203
 
204
- if __name__ == "__main__":
205
- geo_checker = MeshGeoChecker(GPT_CLIENT)
206
- seg_checker = ImageSegChecker(GPT_CLIENT)
207
- aesthetic_checker = ImageAestheticChecker()
208
-
209
- checkers = [geo_checker, seg_checker, aesthetic_checker]
210
-
211
- output_root = "outputs/test_gpt"
212
-
213
- fails = []
214
- for idx in tqdm(range(150)):
215
- mesh_path = f"outputs/imageto3d/demo_objects/cups/sample_{idx}/sample_{idx}.obj" # noqa
216
- if not os.path.exists(mesh_path):
217
- continue
218
- image_paths = render_asset3d(
219
- mesh_path,
220
- f"{output_root}/{idx}",
221
- num_images=8,
222
- elevation=(30, -30),
223
- distance=5.5,
 
 
 
 
 
 
224
  )
225
 
226
- for cid, checker in enumerate(checkers):
227
- if isinstance(checker, ImageSegChecker):
228
- images = [
229
- f"outputs/imageto3d/demo_objects/cups/sample_{idx}/sample_{idx}_raw.png", # noqa
230
- f"outputs/imageto3d/demo_objects/cups/sample_{idx}/sample_{idx}_cond.png", # noqa
231
- ]
232
- else:
233
- images = image_paths
234
- result, info = checker(images)
235
- logger.info(
236
- f"Checker {checker.__class__.__name__}: {result}, {info}, mesh {mesh_path}" # noqa
 
 
 
 
 
 
 
 
237
  )
 
 
 
 
238
 
239
- if result is False:
240
- fails.append((idx, cid, info))
 
 
 
 
 
 
 
241
 
242
- break
 
 
 
 
 
 
 
 
16
 
17
 
18
  import logging
19
+ import random
20
 
21
+ import json_repair
22
+ from PIL import Image
23
  from embodied_gen.utils.gpt_clients import GPT_CLIENT, GPTclient
 
24
  from embodied_gen.validators.aesthetic_predictor import AestheticPredictor
25
 
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
 
29
 
30
+ __all__ = [
31
+ "MeshGeoChecker",
32
+ "ImageSegChecker",
33
+ "ImageAestheticChecker",
34
+ "SemanticConsistChecker",
35
+ "TextGenAlignChecker",
36
+ "PanoImageGenChecker",
37
+ "PanoHeightEstimator",
38
+ "PanoImageOccChecker",
39
+ ]
40
+
41
+
42
  class BaseChecker:
43
  def __init__(self, prompt: str = None, verbose: bool = False) -> None:
44
  self.prompt = prompt
 
49
  "Subclasses must implement the query method."
50
  )
51
 
52
+ def __call__(self, *args, **kwargs) -> tuple[bool, str]:
53
  response = self.query(*args, **kwargs)
54
+ if self.verbose:
 
 
 
55
  logger.info(response)
56
 
57
+ if response is None:
58
+ flag = None
59
+ response = (
60
+ "Error when calling GPT api, check config in "
61
+ "`embodied_gen/utils/gpt_config.yaml` or net connection."
62
+ )
63
+ else:
64
+ flag = "YES" in response
65
+ response = "YES" if flag else response
66
 
67
  return flag, response
68
 
 
108
  self.gpt_client = gpt_client
109
  if self.prompt is None:
110
  self.prompt = """
111
+ You are an expert in evaluating the geometry quality of a generated 3D asset.
112
+ You will be given rendered views of a generated 3D asset on a black background.
113
+ Your task is to evaluate the quality of the 3D asset generation,
114
+ including geometry, structure, and appearance, based on the rendered views.
115
+ Criteria:
116
+ - Is the object in the image a single, complete, and well-formed instance,
117
+ without truncation, missing parts, overlapping duplicates, or redundant geometry?
118
+ - Minor flaws, asymmetries, or simplifications (e.g., less detail on sides or back,
119
+ soft edges) are acceptable if the object is structurally sound and recognizable.
120
+ - Only evaluate geometry. Do not assess texture quality.
121
+ - The asset should not contain any unrelated elements, such as
122
+ ground planes, platforms, or background props (e.g., paper, flooring).
123
+
124
+ If all the above criteria are met, return "YES". Otherwise, return
125
+ "NO" followed by a brief explanation (no more than 20 words).
126
+
127
+ Example:
128
+ Images show a yellow cup standing on a flat white plane -> NO
129
+ -> Response: NO: extra white surface under the object.
130
+ Image shows a chair with simplified back legs and soft edges -> YES
131
  """
132
 
133
+ def query(self, image_paths: list[str | Image.Image]) -> str:
 
 
 
 
 
 
 
134
 
135
  return self.gpt_client.query(
136
  text_prompt=self.prompt,
 
161
  self.gpt_client = gpt_client
162
  if self.prompt is None:
163
  self.prompt = """
164
+ Task: Evaluate the quality of object segmentation between two images:
165
+ the first is the original, the second is the segmented result.
166
+
167
+ Criteria:
168
+ - The main foreground object should be clearly extracted (not the background).
169
+ - The object must appear realistic, with reasonable geometry and color.
170
+ - The object should be geometrically complete: no missing, truncated, or cropped parts.
171
+ - The object must be centered, with a margin on all sides.
172
+ - Ignore minor imperfections (e.g., small holes or fine edge artifacts).
173
+
174
+ Output Rules:
175
+ If segmentation is acceptable, respond with "YES" (and nothing else).
176
+ If not acceptable, respond with "NO", followed by a brief reason (max 20 words).
177
  """
178
 
179
  def query(self, image_paths: list[str]) -> str:
 
181
  raise ValueError(
182
  "ImageSegChecker requires exactly two images: [raw_image, seg_image]." # noqa
183
  )
 
 
 
 
 
 
 
184
 
185
  return self.gpt_client.query(
186
  text_prompt=self.prompt,
 
223
  return avg_score > self.thresh, avg_score
224
 
225
 
226
+ class SemanticConsistChecker(BaseChecker):
227
+ def __init__(
228
+ self,
229
+ gpt_client: GPTclient,
230
+ prompt: str = None,
231
+ verbose: bool = False,
232
+ ) -> None:
233
+ super().__init__(prompt, verbose)
234
+ self.gpt_client = gpt_client
235
+ if self.prompt is None:
236
+ self.prompt = """
237
+ You are an expert in image-text consistency assessment.
238
+ You will be given:
239
+ - A short text description of an object.
240
+ - A segmented image of the same object with the background removed.
241
+
242
+ Criteria:
243
+ - The image must visually match the text description in terms of object type, structure, geometry, and color.
244
+ - The object must appear realistic, with reasonable geometry (e.g., a table must have a plausible number
245
+ of legs with a reasonable distribution. Count the legs visible in the image; (strict) for tables,
246
+ fewer than four legs, or legs that are unevenly distributed, are not allowed. Do not assume
247
+ hidden legs unless they are clearly visible.)
248
+ - Geometric completeness is required: the object must not have missing, truncated, or cropped parts.
249
+ - The image must contain exactly one object. Multiple distinct objects are not allowed.
250
+ A single composite object (e.g., a chair with legs) is acceptable.
251
+ - The object should be shown from a slightly angled (three-quarter) perspective,
252
+ not a flat, front-facing view showing only one surface.
253
+
254
+ Instructions:
255
+ - If all criteria are met, return `"YES"`.
256
+ - Otherwise, return "NO" with a brief explanation (max 20 words).
257
+
258
+ Respond in exactly one of the following formats:
259
+ YES
260
+ or
261
+ NO: brief explanation.
262
+
263
+ Input:
264
+ {}
265
+ """
266
+
267
+ def query(self, text: str, image: list[Image.Image | str]) -> str:
268
+
269
+ return self.gpt_client.query(
270
+ text_prompt=self.prompt.format(text),
271
+ image_base64=image,
272
  )
273
 
274
+
275
+ class TextGenAlignChecker(BaseChecker):
276
+ def __init__(
277
+ self,
278
+ gpt_client: GPTclient,
279
+ prompt: str = None,
280
+ verbose: bool = False,
281
+ ) -> None:
282
+ super().__init__(prompt, verbose)
283
+ self.gpt_client = gpt_client
284
+ if self.prompt is None:
285
+ self.prompt = """
286
+ You are an expert in evaluating the quality of generated 3D assets.
287
+ You will be given:
288
+ - A text description of an object: TEXT
289
+ - Rendered views of the generated 3D asset.
290
+
291
+ Your task is to:
292
+ 1. Determine whether the generated 3D asset roughly reflects the object class
293
+ or a semantically adjacent category described in the text.
294
+ 2. Evaluate the geometry quality of the 3D asset generation based on the rendered views.
295
+
296
+ Criteria:
297
+ - Determine if the generated 3D asset belongs to the category described in the text or a similar category.
298
+ - Focus on functional similarity: if the object serves the same general
299
+ purpose (e.g., writing, placing items), it should be accepted.
300
+ - Is the geometry complete and well-formed, with no missing parts,
301
+ distortions, visual artifacts, or redundant structures?
302
+ - Does the number of object instances match the description?
303
+ There should be only one object unless otherwise specified.
304
+ - Minor flaws in geometry or texture are acceptable; be highly tolerant of texture quality defects.
305
+ - Minor simplifications in geometry or texture (e.g. soft edges, less detail)
306
+ are acceptable if the object is still recognizable.
307
+ - The asset should not contain any unrelated elements, such as
308
+ ground planes, platforms, or background props (e.g., paper, flooring).
309
+
310
+ Example:
311
+ Text: "yellow cup"
312
+ Image: shows a yellow cup standing on a flat white plane -> NO: extra surface under the object.
313
+
314
+ Instructions:
315
+ - If the quality of generated asset is acceptable and faithfully represents the text, return "YES".
316
+ - Otherwise, return "NO" followed by a brief explanation (no more than 20 words).
317
+
318
+ Respond in exactly one of the following formats:
319
+ YES
320
+ or
321
+ NO: brief explanation
322
+
323
+ Input:
324
+ Text description: {}
325
+ """
326
+
327
+ def query(self, text: str, image: list[Image.Image | str]) -> str:
328
+
329
+ return self.gpt_client.query(
330
+ text_prompt=self.prompt.format(text),
331
+ image_base64=image,
332
+ )
333
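For illustration, a minimal usage sketch of the YES/NO contract TextGenAlignChecker exposes (the render paths are placeholders; GPT_CLIENT is the shared client used elsewhere in this module):

    checker = TextGenAlignChecker(GPT_CLIENT)
    # query() formats the text into the prompt and sends it together with the rendered views.
    verdict = checker.query("yellow cup", ["renders/view_0.png", "renders/view_1.png"])
    if verdict.strip().upper().startswith("YES"):
        print("asset accepted")
    else:
        print("asset rejected:", verdict)  # e.g. "NO: extra surface under the object."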
+
334
+
335
+ class PanoImageGenChecker(BaseChecker):
336
+ """A checker class that validates the quality and realism of generated panoramic indoor images.
337
+
338
+ Attributes:
339
+ gpt_client (GPTclient): A GPT client instance used to query for image validation.
340
+ prompt (str): The instruction prompt passed to the GPT model. If None, a default prompt is used.
341
+ verbose (bool): Whether to print internal processing information for debugging.
342
+ """
343
+
344
+ def __init__(
345
+ self,
346
+ gpt_client: GPTclient,
347
+ prompt: str = None,
348
+ verbose: bool = False,
349
+ ) -> None:
350
+ super().__init__(prompt, verbose)
351
+ self.gpt_client = gpt_client
352
+ if self.prompt is None:
353
+ self.prompt = """
354
+ You are a panoramic image analyzer specializing in indoor room structure validation.
355
+
356
+ Given a generated panoramic image, assess if it meets all the criteria:
357
+ - Floor Space: ≥30 percent of the floor is free of objects or obstructions.
358
+ - Visual Clarity: Floor, walls, and ceiling are clear, with no distortion, blur, or noise.
359
+ - Structural Continuity: Surfaces form plausible, continuous geometry
360
+ without breaks, floating parts, or abrupt cuts.
361
+ - Spatial Completeness: Full 360° coverage without missing areas,
362
+ seams, gaps, or stitching artifacts.
363
+ Instructions:
364
+ - If all criteria are met, reply with "YES".
365
+ - Otherwise, reply with "NO: <brief explanation>" (max 20 words).
366
+
367
+ Respond exactly as:
368
+ "YES"
369
+ or
370
+ "NO: brief explanation."
371
+ """
372
+
373
+ def query(self, image_paths: str | Image.Image) -> str:
374
+
375
+ return self.gpt_client.query(
376
+ text_prompt=self.prompt,
377
+ image_base64=image_paths,
378
+ )
379
+
380
+
381
+ class PanoImageOccChecker(BaseChecker):
382
+ """Checks for physical obstacles in the bottom-center region of a panoramic image.
383
+
384
+ This class crops a specified region from the input panoramic image and uses
385
+ a GPT client to determine whether any physical obstacles are present there.
386
+
387
+ Args:
388
+ gpt_client (GPTclient): The GPT-based client used for visual reasoning.
389
+ box_hw (tuple[int, int]): The height and width of the crop box.
390
+ prompt (str, optional): Custom prompt for the GPT client. Defaults to a predefined one.
391
+ verbose (bool, optional): Whether to print verbose logs. Defaults to False.
392
+ """
393
+
394
+ def __init__(
395
+ self,
396
+ gpt_client: GPTclient,
397
+ box_hw: tuple[int, int],
398
+ prompt: str = None,
399
+ verbose: bool = False,
400
+ ) -> None:
401
+ super().__init__(prompt, verbose)
402
+ self.gpt_client = gpt_client
403
+ self.box_hw = box_hw
404
+ if self.prompt is None:
405
+ self.prompt = """
406
+ This image is a cropped region from the bottom-center of a panoramic view.
407
+ Please determine whether there is any obstacle present, such as furniture, tables, or other physical objects.
408
+ Ignore floor textures, rugs, carpets, shadows, and lighting effects; they do not count as obstacles.
409
+ Only consider real, physical objects that could block walking or movement.
410
+
411
+ Instructions:
412
+ - If there is no obstacle, reply: "YES".
413
+ - Otherwise, reply: "NO: <brief explanation>" (max 20 words).
414
+
415
+ Respond exactly as:
416
+ "YES"
417
+ or
418
+ "NO: brief explanation."
419
+ """
420
+
421
+ def query(self, image_paths: str | Image.Image) -> str:
422
+ if isinstance(image_paths, str):
423
+ image_paths = Image.open(image_paths)
424
+
425
+ w, h = image_paths.size
426
+ image_paths = image_paths.crop(
427
+ (
428
+ (w - self.box_hw[1]) // 2,
429
+ h - self.box_hw[0],
430
+ (w + self.box_hw[1]) // 2,
431
+ h,
432
+ )
433
+ )
434
+
435
+ return self.gpt_client.query(
436
+ text_prompt=self.prompt,
437
+ image_base64=image_paths,
438
+ )
439
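For illustration, the bottom-center crop that PanoImageOccChecker.query computes, worked through with assumed sizes (a 2048x1024 panorama and box_hw=(256, 512)):

    w, h = 2048, 1024          # assumed panorama width and height
    box_h, box_w = 256, 512    # assumed box_hw
    crop_box = ((w - box_w) // 2, h - box_h, (w + box_w) // 2, h)
    # crop_box == (768, 768, 1280, 1024): a 512x256 window centered horizontally at the bottom edge.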
+
440
+
441
+ class PanoHeightEstimator(object):
442
+ """Estimate the real ceiling height of an indoor space from a 360Β° panoramic image.
443
+
444
+ Attributes:
445
+ gpt_client (GPTclient): The GPT client used to perform image-based reasoning and return height estimates.
446
+ default_value (float): The fallback height in meters if parsing the GPT output fails.
447
+ prompt (str): The textual instruction used to guide the GPT model for height estimation.
448
+ """
449
+
450
+ def __init__(
451
+ self,
452
+ gpt_client: GPTclient,
453
+ default_value: float = 3.5,
454
+ ) -> None:
455
+ self.gpt_client = gpt_client
456
+ self.default_value = default_value
457
+ self.prompt = """
458
+ You are an expert in building height estimation and panoramic image analysis.
459
+ Your task is to analyze a 360° indoor panoramic image and estimate the **actual height** of the space in meters.
460
+
461
+ Consider the following visual cues:
462
+ 1. Ceiling visibility and reference objects (doors, windows, furniture, appliances).
463
+ 2. Floor features or level differences.
464
+ 3. Room type (e.g., residential, office, commercial).
465
+ 4. Object-to-ceiling proportions (e.g., height of doors relative to ceiling).
466
+ 5. Architectural elements (e.g., chandeliers, shelves, kitchen cabinets).
467
+
468
+ Input: A full 360° panoramic indoor photo.
469
+ Output: A single number in meters representing the estimated room height. Only return the number (e.g., `3.2`)
470
+ """
471
+
472
+ def __call__(self, image_paths: str | Image.Image) -> float:
473
+ result = self.gpt_client.query(
474
+ text_prompt=self.prompt,
475
+ image_base64=image_paths,
476
+ )
477
+ try:
478
+ result = float(result.strip())
479
+ except Exception as e:
480
+ logger.error(
481
+ f"Parser error: failed convert {result} to float, {e}, use default value {self.default_value}."
482
  )
483
+ result = self.default_value
484
+
485
+ return result
486
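For illustration, a minimal usage sketch of PanoHeightEstimator (the panorama path and default_value are placeholders):

    estimator = PanoHeightEstimator(GPT_CLIENT, default_value=3.0)
    # __call__ returns the parsed float, or default_value (here 3.0 m) if the reply cannot be parsed.
    height_m = estimator("outputs/panorama.png")
    print(f"estimated ceiling height: {height_m:.2f} m")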
+
487
 
488
+ class SemanticMatcher(BaseChecker):
489
+ def __init__(
490
+ self,
491
+ gpt_client: GPTclient,
492
+ prompt: str = None,
493
+ verbose: bool = False,
494
+ seed: int = None,
495
+ ) -> None:
496
+ super().__init__(prompt, verbose)
497
+ self.gpt_client = gpt_client
498
+ self.seed = seed
499
+ random.seed(seed)
500
+ if self.prompt is None:
501
+ self.prompt = """
502
+ You are an expert in semantic similarity and scene retrieval.
503
+ You will be given:
504
+ - A dictionary where each key is a scene ID, and each value is a scene description.
505
+ - A query text describing a target scene.
506
+
507
+ Your task:
508
+ return_num = 2
509
+ - Find the <return_num> most semantically similar scene IDs to the query text.
510
+ - If there are fewer than <return_num> distinct relevant matches, repeat the closest ones to make a list of <return_num>.
511
+ - Only output the list of <return_num> scene IDs, sorted from most to least similar.
512
+ - Do NOT use markdown, JSON code blocks, or any formatting syntax; only return a plain list like ["id1", ...].
513
+
514
+ Input example:
515
+ Dictionary:
516
+ "{{
517
+ "t_scene_008": "A study room with full bookshelves and a lamp in the corner.",
518
+ "t_scene_019": "A child's bedroom with pink walls and a small desk.",
519
+ "t_scene_020": "A living room with a wooden floor.",
520
+ "t_scene_021": "A living room with toys scattered on the floor.",
521
+ ...
522
+ "t_scene_office_001": "A very spacious, modern open-plan office with wide desks and no people, panoramic view."
523
+ }}"
524
+ Text:
525
+ "A traditional indoor room"
526
+ Output:
527
+ '["t_scene_office_001", ...]'
528
+
529
+ Input:
530
+ Dictionary:
531
+ {context}
532
+ Text:
533
+ {text}
534
+ Output:
535
+ <topk_key_list>
536
+ """
537
 
538
+ def query(
539
+ self, text: str, context: dict, rand: bool = True, params: dict = None
540
+ ) -> str:
541
+ match_list = self.gpt_client.query(
542
+ self.prompt.format(context=context, text=text),
543
+ params=params,
544
+ )
545
+ match_list = json_repair.loads(match_list)
546
+ result = random.choice(match_list) if rand else match_list[0]
547
+
548
+ return result
549
+
550
+
551
+ def test_semantic_matcher(
552
+ bg_file: str = "outputs/bg_scenes/bg_scene_list.txt",
553
+ ):
554
+ bg_file = "outputs/bg_scenes/bg_scene_list.txt"
555
+ scene_dict = {}
556
+ with open(bg_file, "r") as f:
557
+ for line in f:
558
+ line = line.strip()
559
+ if not line or ":" not in line:
560
+ continue
561
+ scene_id, desc = line.split(":", 1)
562
+ scene_dict[scene_id.strip()] = desc.strip()
563
+
564
+ office_scene = scene_dict.get("t_scene_office_001")
565
+ text = "bright kitchen"
566
+ SCENE_MATCHER = SemanticMatcher(GPT_CLIENT)
567
+ # gpt_params = {
568
+ # "temperature": 0.8,
569
+ # "max_tokens": 500,
570
+ # "top_p": 0.8,
571
+ # "frequency_penalty": 0.3,
572
+ # "presence_penalty": 0.3,
573
+ # }
574
+ gpt_params = None
575
+ match_key = SCENE_MATCHER.query(text, str(scene_dict), params=gpt_params)
576
+ print(match_key, ",", scene_dict[match_key])
577
+
578
+
579
+ if __name__ == "__main__":
580
+ test_semantic_matcher()
embodied_gen/validators/urdf_convertor.py CHANGED
@@ -101,34 +101,42 @@ class URDFGenerator(object):
101
  prompt_template = (
102
  view_desc
103
  + """of the 3D object asset,
104
- category: {category}.
105
- You are an expert in 3D object analysis and physical property estimation.
106
- Give the category of this object asset (within 3 words),
107
- (if category is already provided, use it directly),
108
- accurately describe this 3D object asset (within 15 words),
109
- and give the recommended geometric height range (unit: meter),
110
- weight range (unit: kilogram), the average static friction
111
- coefficient of the object relative to rubber and the average
112
- dynamic friction coefficient of the object relative to rubber.
113
- Return response format as shown in Output Example.
114
-
115
- IMPORTANT:
116
- Inputed images are orthographic projection showing the front, left, right and back views,
117
- the first image is always the front view. Use the object's pose and orientation in the
118
- rendered images to estimate its **true vertical height as it appears in the image**,
119
- not the real-world length or width of the object.
120
- For example:
121
- - A pen standing upright in the front view β†’ vertical height: 0.15-0.2 m
122
- - A pen lying horizontally in the front view β†’ vertical height: 0.01-0.02 m
123
- (based on its thickness in the image)
124
-
125
- Output Example:
126
- Category: cup
127
- Description: shiny golden cup with floral design
128
- Height: 0.1-0.15 m
129
- Weight: 0.3-0.6 kg
130
- Static friction coefficient: 1.1
131
- Dynamic friction coefficient: 0.9
 
 
 
 
 
 
 
 
132
  """
133
  )
134
 
@@ -297,20 +305,24 @@ class URDFGenerator(object):
297
  if not os.path.exists(urdf_path):
298
  raise FileNotFoundError(f"URDF file not found: {urdf_path}")
299
 
300
- mesh_scale = 1.0
301
  tree = ET.parse(urdf_path)
302
  root = tree.getroot()
303
  extra_info = root.find(attr_root)
304
  if extra_info is not None:
305
  scale_element = extra_info.find(attr_name)
306
  if scale_element is not None:
307
- mesh_scale = float(scale_element.text)
 
 
 
 
308
 
309
- return mesh_scale
310
 
311
  @staticmethod
312
  def add_quality_tag(
313
- urdf_path: str, results, output_path: str = None
314
  ) -> None:
315
  if output_path is None:
316
  output_path = urdf_path
@@ -366,17 +378,11 @@ class URDFGenerator(object):
366
  output_root,
367
  num_images=self.render_view_num,
368
  output_subdir=self.output_render_dir,
 
369
  )
370
 
371
- # Hardcode tmp because of the openrouter can't input multi images.
372
- if "openrouter" in self.gpt_client.endpoint:
373
- from embodied_gen.utils.process_media import (
374
- combine_images_to_base64,
375
- )
376
-
377
- image_path = combine_images_to_base64(image_path)
378
-
379
  response = self.gpt_client.query(text_prompt, image_path)
 
380
  if response is None:
381
  asset_attrs = {
382
  "category": category.lower(),
@@ -412,14 +418,18 @@ class URDFGenerator(object):
412
  if __name__ == "__main__":
413
  urdf_gen = URDFGenerator(GPT_CLIENT, render_view_num=4)
414
  urdf_path = urdf_gen(
415
- mesh_path="outputs/imageto3d/cma/o5/URDF_o5/mesh/o5.obj",
416
  output_root="outputs/test_urdf",
417
- # category="coffee machine",
418
  # min_height=1.0,
419
  # max_height=1.2,
420
  version=VERSION,
421
  )
422
 
 
 
 
 
423
  # zip_files(
424
  # input_paths=[
425
  # "scripts/apps/tmp/2umpdum3e5n/URDF_sample/mesh",
 
101
  prompt_template = (
102
  view_desc
103
  + """of the 3D object asset,
104
+ category: {category}.
105
+ You are an expert in 3D object analysis and physical property estimation.
106
+ Give the category of this object asset (within 3 words), (if category is
107
+ already provided, use it directly), accurately describe this 3D object asset (within 15 words).
108
+ Determine the pose of the object in the first image and estimate the true vertical height
109
+ (vertical projection) range of the object (in meters), i.e., how tall the object appears from top
110
+ to bottom in the front view (first) image. Also give the weight range (unit: kilogram), the average
111
+ static friction coefficient of the object relative to rubber and the average dynamic friction
112
+ coefficient of the object relative to rubber. Return the response in the format shown in the Output Example.
113
+
114
+ Output Example:
115
+ Category: cup
116
+ Description: shiny golden cup with floral design
117
+ Height: 0.1-0.15 m
118
+ Weight: 0.3-0.6 kg
119
+ Static friction coefficient: 0.6
120
+ Dynamic friction coefficient: 0.5
121
+
122
+ IMPORTANT: Estimating Vertical Height from the First (Front View) Image.
123
+ - The "vertical height" refers to the real-world vertical size of the object
124
+ as projected in the first image, aligned with the image's vertical axis.
125
+ - For flat objects like plates, disks, or books, if their face is visible in the front view,
126
+ use the diameter as the vertical height. If the edge is visible, use the thickness instead.
127
+ - This is not necessarily the full length of the object, but how tall it appears
128
+ in the first image vertically, based on its pose and orientation.
129
+ - For objects (e.g., spoons, forks, writing instruments, etc.) shown at an angle in
130
+ the first image, one tilted at 45° will appear shorter vertically than when upright.
131
+ Estimate the vertical projection of the real length based on the object's pose.
132
+ For example:
133
+ - A pen standing upright in the first view (aligned with the image's vertical axis)
134
+ full body visible in the first image → vertical height ≈ 0.14-0.20 m
135
+ - A pen lying flat in the front view (showing thickness) → vertical height ≈ 0.018-0.025 m
136
+ - A pen tilted in the first image (e.g., ~45° angle) → vertical height ≈ 0.07-0.12 m
137
+ - Use the remaining views (all except the first image) to help determine the object's 3D pose and orientation.
138
+ Assume the object is in real-world scale and estimate the approximate vertical height
139
+ (in meters) based on how large it appears vertically in the first image.
140
  """
141
  )
142
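For illustration, the projection arithmetic the prompt above asks the model to approximate, using an assumed 0.15 m pen:

    import math

    pen_length_m = 0.15  # assumed real-world length
    for tilt_deg in (0, 45, 90):  # tilt away from the image's vertical axis
        v = pen_length_m * math.cos(math.radians(tilt_deg))
        print(f"tilt {tilt_deg:>2} deg -> vertical projection ~{v:.3f} m")
    # upright -> ~0.150 m, ~45 deg -> ~0.106 m, lying flat -> ~0.000 m; in the flat case the
    # visible vertical extent is bounded below by the pen's thickness rather than by zero.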
 
 
305
  if not os.path.exists(urdf_path):
306
  raise FileNotFoundError(f"URDF file not found: {urdf_path}")
307
 
308
+ mesh_attr = None
309
  tree = ET.parse(urdf_path)
310
  root = tree.getroot()
311
  extra_info = root.find(attr_root)
312
  if extra_info is not None:
313
  scale_element = extra_info.find(attr_name)
314
  if scale_element is not None:
315
+ mesh_attr = scale_element.text
316
+ try:
317
+ mesh_attr = float(mesh_attr)
318
+ except ValueError:
319
+ pass
320
 
321
+ return mesh_attr
322
 
323
  @staticmethod
324
  def add_quality_tag(
325
+ urdf_path: str, results: list, output_path: str = None
326
  ) -> None:
327
  if output_path is None:
328
  output_path = urdf_path
 
378
  output_root,
379
  num_images=self.render_view_num,
380
  output_subdir=self.output_render_dir,
381
+ no_index_file=True,
382
  )
383
 
 
 
 
 
 
 
 
 
384
  response = self.gpt_client.query(text_prompt, image_path)
385
+ # logger.info(response)
386
  if response is None:
387
  asset_attrs = {
388
  "category": category.lower(),
 
418
  if __name__ == "__main__":
419
  urdf_gen = URDFGenerator(GPT_CLIENT, render_view_num=4)
420
  urdf_path = urdf_gen(
421
+ mesh_path="outputs/layout2/asset3d/marker/result/mesh/marker.obj",
422
  output_root="outputs/test_urdf",
423
+ category="marker",
424
  # min_height=1.0,
425
  # max_height=1.2,
426
  version=VERSION,
427
  )
428
 
429
+ URDFGenerator.add_quality_tag(
430
+ urdf_path, [[urdf_gen.__class__.__name__, "OK"]]
431
+ )
432
+
433
  # zip_files(
434
  # input_paths=[
435
  # "scripts/apps/tmp/2umpdum3e5n/URDF_sample/mesh",