1

2026-03-17 11:30:00 +08:00
parent 06659057c3
commit 07ed844f31
6 changed files with 982 additions and 0 deletions
--- a/trellis/datasets/sparse_feat2render.py
+++ b/trellis/datasets/sparse_feat2render.py
@@ -0,0 +1,134 @@
+import os
+from PIL import Image
+import json
+import numpy as np
+import pandas as pd
+import torch
+import utils3d.torch
+from ..modules.sparse.basic import SparseTensor
+from .components import StandardDatasetBase
+
+
+class SparseFeat2Render(StandardDatasetBase):
+    """
+    SparseFeat2Render dataset.
+    
+    Args:
+        roots (str): paths to the dataset
+        image_size (int): size of the image
+        model (str): model name
+        resolution (int): resolution of the data
+        min_aesthetic_score (float): minimum aesthetic score
+        max_num_voxels (int): maximum number of voxels
+    """
+    def __init__(
+        self,
+        roots: str,
+        image_size: int,
+        model: str = 'dinov2_vitl14_reg',
+        resolution: int = 64,
+        min_aesthetic_score: float = 5.0,
+        max_num_voxels: int = 32768,
+    ):
+        self.image_size = image_size
+        self.model = model
+        self.resolution = resolution
+        self.min_aesthetic_score = min_aesthetic_score
+        self.max_num_voxels = max_num_voxels
+        self.value_range = (0, 1)
+        
+        super().__init__(roots)
+        
+    def filter_metadata(self, metadata):
+        stats = {}
+        metadata = metadata[metadata[f'feature_{self.model}']]
+        stats['With features'] = len(metadata)
+        metadata = metadata[metadata['aesthetic_score'] >= self.min_aesthetic_score]
+        stats[f'Aesthetic score >= {self.min_aesthetic_score}'] = len(metadata)
+        metadata = metadata[metadata['num_voxels'] <= self.max_num_voxels]
+        stats[f'Num voxels <= {self.max_num_voxels}'] = len(metadata)
+        return metadata, stats
+
+    def _get_image(self, root, instance):
+        with open(os.path.join(root, 'renders', instance, 'transforms.json')) as f:
+            metadata = json.load(f)
+        n_views = len(metadata['frames'])
+        view = np.random.randint(n_views)
+        metadata = metadata['frames'][view]
+        fov = metadata['camera_angle_x']
+        intrinsics = utils3d.torch.intrinsics_from_fov_xy(torch.tensor(fov), torch.tensor(fov))
+        c2w = torch.tensor(metadata['transform_matrix'])
+        c2w[:3, 1:3] *= -1
+        extrinsics = torch.inverse(c2w)
+
+        image_path = os.path.join(root, 'renders', instance, metadata['file_path'])
+        image = Image.open(image_path)
+        alpha = image.getchannel(3)
+        image = image.convert('RGB')
+        image = image.resize((self.image_size, self.image_size), Image.Resampling.LANCZOS)
+        alpha = alpha.resize((self.image_size, self.image_size), Image.Resampling.LANCZOS)
+        image = torch.tensor(np.array(image)).permute(2, 0, 1).float() / 255.0
+        alpha = torch.tensor(np.array(alpha)).float() / 255.0
+        
+        return {
+            'image': image,
+            'alpha': alpha,
+            'extrinsics': extrinsics,
+            'intrinsics': intrinsics,
+        }
+    
+    def _get_feat(self, root, instance):
+        DATA_RESOLUTION = 64
+        feats_path = os.path.join(root, 'features', self.model, f'{instance}.npz')
+        feats = np.load(feats_path, allow_pickle=True)
+        coords = torch.tensor(feats['indices']).int()
+        feats = torch.tensor(feats['patchtokens']).float()
+        
+        if self.resolution != DATA_RESOLUTION:
+            factor = DATA_RESOLUTION // self.resolution
+            coords = coords // factor
+            coords, idx = coords.unique(return_inverse=True, dim=0)
+            feats = torch.scatter_reduce(
+                torch.zeros(coords.shape[0], feats.shape[1], device=feats.device),
+                dim=0,
+                index=idx.unsqueeze(-1).expand(-1, feats.shape[1]),
+                src=feats,
+                reduce='mean'
+            )
+        
+        return {
+            'coords': coords,
+            'feats': feats,
+        }
+
+    @torch.no_grad()
+    def visualize_sample(self, sample: dict):
+        return sample['image']
+
+    @staticmethod
+    def collate_fn(batch):
+        pack = {}
+        coords = []
+        for i, b in enumerate(batch):
+            coords.append(torch.cat([torch.full((b['coords'].shape[0], 1), i, dtype=torch.int32), b['coords']], dim=-1))
+        coords = torch.cat(coords)
+        feats = torch.cat([b['feats'] for b in batch])
+        pack['feats'] = SparseTensor(
+            coords=coords,
+            feats=feats,
+        )
+        
+        pack['image'] = torch.stack([b['image'] for b in batch])
+        pack['alpha'] = torch.stack([b['alpha'] for b in batch])
+        pack['extrinsics'] = torch.stack([b['extrinsics'] for b in batch])
+        pack['intrinsics'] = torch.stack([b['intrinsics'] for b in batch])
+
+        return pack
+
+    def get_instance(self, root, instance):
+        image = self._get_image(root, instance)
+        feat = self._get_feat(root, instance)
+        return {
+            **image,
+            **feat,
+        }
--- a/trellis/datasets/sparse_structure_latent.py
+++ b/trellis/datasets/sparse_structure_latent.py
@@ -0,0 +1,188 @@
+import os
+import json
+from typing import *
+import numpy as np
+import torch
+import utils3d
+from ..representations.octree import DfsOctree as Octree
+from ..renderers import OctreeRenderer
+from .components import StandardDatasetBase, TextConditionedMixin, ImageConditionedMixin
+from .. import models
+
+
+class SparseStructureLatentVisMixin:
+    def __init__(
+        self,
+        *args,
+        pretrained_ss_dec: str = 'microsoft/TRELLIS-image-large/ckpts/ss_dec_conv3d_16l8_fp16',
+        ss_dec_path: Optional[str] = None,
+        ss_dec_ckpt: Optional[str] = None,
+        **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.ss_dec = None
+        self.pretrained_ss_dec = pretrained_ss_dec
+        self.ss_dec_path = ss_dec_path
+        self.ss_dec_ckpt = ss_dec_ckpt
+        
+    def _loading_ss_dec(self):
+        if self.ss_dec is not None:
+            return
+        if self.ss_dec_path is not None:
+            cfg = json.load(open(os.path.join(self.ss_dec_path, 'config.json'), 'r'))
+            decoder = getattr(models, cfg['models']['decoder']['name'])(**cfg['models']['decoder']['args'])
+            ckpt_path = os.path.join(self.ss_dec_path, 'ckpts', f'decoder_{self.ss_dec_ckpt}.pt')
+            decoder.load_state_dict(torch.load(ckpt_path, map_location='cpu', weights_only=True))
+        else:
+            decoder = models.from_pretrained(self.pretrained_ss_dec)
+        self.ss_dec = decoder.cuda().eval()
+
+    def _delete_ss_dec(self):
+        del self.ss_dec
+        self.ss_dec = None
+
+    @torch.no_grad()
+    def decode_latent(self, z, batch_size=4):
+        self._loading_ss_dec()
+        ss = []
+        if self.normalization is not None:
+            z = z * self.std.to(z.device) + self.mean.to(z.device)
+        for i in range(0, z.shape[0], batch_size):
+            ss.append(self.ss_dec(z[i:i+batch_size]))
+        ss = torch.cat(ss, dim=0)
+        self._delete_ss_dec()
+        return ss
+
+    @torch.no_grad()
+    def visualize_sample(self, x_0: Union[torch.Tensor, dict]):
+        x_0 = x_0 if isinstance(x_0, torch.Tensor) else x_0['x_0']
+        x_0 = self.decode_latent(x_0.cuda())
+        
+        renderer = OctreeRenderer()
+        renderer.rendering_options.resolution = 512
+        renderer.rendering_options.near = 0.8
+        renderer.rendering_options.far = 1.6
+        renderer.rendering_options.bg_color = (0, 0, 0)
+        renderer.rendering_options.ssaa = 4
+        renderer.pipe.primitive = 'voxel'
+        
+        # Build camera
+        yaws = [0, np.pi / 2, np.pi, 3 * np.pi / 2]
+        yaws_offset = np.random.uniform(-np.pi / 4, np.pi / 4)
+        yaws = [y + yaws_offset for y in yaws]
+        pitch = [np.random.uniform(-np.pi / 4, np.pi / 4) for _ in range(4)]
+
+        exts = []
+        ints = []
+        for yaw, pitch in zip(yaws, pitch):
+            orig = torch.tensor([
+                np.sin(yaw) * np.cos(pitch),
+                np.cos(yaw) * np.cos(pitch),
+                np.sin(pitch),
+            ]).float().cuda() * 2
+            fov = torch.deg2rad(torch.tensor(30)).cuda()
+            extrinsics = utils3d.torch.extrinsics_look_at(orig, torch.tensor([0, 0, 0]).float().cuda(), torch.tensor([0, 0, 1]).float().cuda())
+            intrinsics = utils3d.torch.intrinsics_from_fov_xy(fov, fov)
+            exts.append(extrinsics)
+            ints.append(intrinsics)
+
+        images = []
+        
+        # Build each representation
+        x_0 = x_0.cuda()
+        for i in range(x_0.shape[0]):
+            representation = Octree(
+                depth=10,
+                aabb=[-0.5, -0.5, -0.5, 1, 1, 1],
+                device='cuda',
+                primitive='voxel',
+                sh_degree=0,
+                primitive_config={'solid': True},
+            )
+            coords = torch.nonzero(x_0[i, 0] > 0, as_tuple=False)
+            resolution = x_0.shape[-1]
+            representation.position = coords.float() / resolution
+            representation.depth = torch.full((representation.position.shape[0], 1), int(np.log2(resolution)), dtype=torch.uint8, device='cuda')
+
+            image = torch.zeros(3, 1024, 1024).cuda()
+            tile = [2, 2]
+            for j, (ext, intr) in enumerate(zip(exts, ints)):
+                res = renderer.render(representation, ext, intr, colors_overwrite=representation.position)
+                image[:, 512 * (j // tile[1]):512 * (j // tile[1] + 1), 512 * (j % tile[1]):512 * (j % tile[1] + 1)] = res['color']
+            images.append(image)
+            
+        return torch.stack(images)
+       
+
+class SparseStructureLatent(SparseStructureLatentVisMixin, StandardDatasetBase):
+    """
+    Sparse structure latent dataset
+    
+    Args:
+        roots (str): path to the dataset
+        latent_model (str): name of the latent model
+        min_aesthetic_score (float): minimum aesthetic score
+        normalization (dict): normalization stats
+        pretrained_ss_dec (str): name of the pretrained sparse structure decoder
+        ss_dec_path (str): path to the sparse structure decoder, if given, will override the pretrained_ss_dec
+        ss_dec_ckpt (str): name of the sparse structure decoder checkpoint
+    """
+    def __init__(self,
+        roots: str,
+        *,
+        latent_model: str,
+        min_aesthetic_score: float = 5.0,
+        normalization: Optional[dict] = None,
+        pretrained_ss_dec: str = 'microsoft/TRELLIS-image-large/ckpts/ss_dec_conv3d_16l8_fp16',
+        ss_dec_path: Optional[str] = None,
+        ss_dec_ckpt: Optional[str] = None,
+    ):
+        self.latent_model = latent_model
+        self.min_aesthetic_score = min_aesthetic_score
+        self.normalization = normalization
+        self.value_range = (0, 1)
+        
+        super().__init__(
+            roots,
+            pretrained_ss_dec=pretrained_ss_dec,
+            ss_dec_path=ss_dec_path,
+            ss_dec_ckpt=ss_dec_ckpt,
+        )
+        
+        if self.normalization is not None:
+            self.mean = torch.tensor(self.normalization['mean']).reshape(-1, 1, 1, 1)
+            self.std = torch.tensor(self.normalization['std']).reshape(-1, 1, 1, 1)
+  
+    def filter_metadata(self, metadata):
+        stats = {}
+        metadata = metadata[metadata[f'ss_latent_{self.latent_model}']]
+        stats['With sparse structure latents'] = len(metadata)
+        metadata = metadata[metadata['aesthetic_score'] >= self.min_aesthetic_score]
+        stats[f'Aesthetic score >= {self.min_aesthetic_score}'] = len(metadata)
+        return metadata, stats
+                
+    def get_instance(self, root, instance):
+        latent = np.load(os.path.join(root, 'ss_latents', self.latent_model, f'{instance}.npz'))
+        z = torch.tensor(latent['mean']).float()
+        if self.normalization is not None:
+            z = (z - self.mean) / self.std
+
+        pack = {
+            'x_0': z,
+        }
+        return pack
+    
+
+class TextConditionedSparseStructureLatent(TextConditionedMixin, SparseStructureLatent):
+    """
+    Text-conditioned sparse structure dataset
+    """
+    pass
+
+
+class ImageConditionedSparseStructureLatent(ImageConditionedMixin, SparseStructureLatent):
+    """
+    Image-conditioned sparse structure dataset
+    """
+    pass
+