From 653f4d304d1f03515d35416209215c9a94818758 Mon Sep 17 00:00:00 2001 From: zcr Date: Tue, 17 Mar 2026 11:30:16 +0800 Subject: [PATCH] 1 --- .../ss_flow_img_dit_L_16l8_fp16.json | 70 ++++++++++++++++++ .../ss_flow_txt_dit_B_16l8_fp16.json | 69 ++++++++++++++++++ .../ss_flow_txt_dit_L_16l8_fp16.json | 69 ++++++++++++++++++ .../ss_flow_txt_dit_XL_16l8_fp16.json | 70 ++++++++++++++++++ .../slat_vae_dec_mesh_swin8_B_64l8_fp16.json | 73 +++++++++++++++++++ configs/vae/ss_vae_conv3d_16l8_fp16.json | 65 +++++++++++++++++ 6 files changed, 416 insertions(+) create mode 100644 configs/generation/ss_flow_img_dit_L_16l8_fp16.json create mode 100644 configs/generation/ss_flow_txt_dit_B_16l8_fp16.json create mode 100644 configs/generation/ss_flow_txt_dit_L_16l8_fp16.json create mode 100644 configs/generation/ss_flow_txt_dit_XL_16l8_fp16.json create mode 100644 configs/vae/slat_vae_dec_mesh_swin8_B_64l8_fp16.json create mode 100644 configs/vae/ss_vae_conv3d_16l8_fp16.json diff --git a/configs/generation/ss_flow_img_dit_L_16l8_fp16.json b/configs/generation/ss_flow_img_dit_L_16l8_fp16.json new file mode 100644 index 0000000..8b6a4a7 --- /dev/null +++ b/configs/generation/ss_flow_img_dit_L_16l8_fp16.json @@ -0,0 +1,70 @@ +{ + "models": { + "denoiser": { + "name": "SparseStructureFlowModel", + "args": { + "resolution": 16, + "in_channels": 8, + "out_channels": 8, + "model_channels": 1024, + "cond_channels": 1024, + "num_blocks": 24, + "num_heads": 16, + "mlp_ratio": 4, + "patch_size": 1, + "pe_mode": "ape", + "qk_rms_norm": true, + "use_fp16": true + } + } + }, + "dataset": { + "name": "ImageConditionedSparseStructureLatent", + "args": { + "latent_model": "ss_enc_conv3d_16l8_fp16", + "min_aesthetic_score": 4.5, + "image_size": 518, + "pretrained_ss_dec": "microsoft/TRELLIS-image-large/ckpts/ss_dec_conv3d_16l8_fp16" + } + }, + "trainer": { + "name": "ImageConditionedFlowMatchingCFGTrainer", + "args": { + "max_steps": 1000000, + "batch_size_per_gpu": 8, + "batch_split": 1, + "optimizer": { + "name": "AdamW", + "args": { + "lr": 0.0001, + "weight_decay": 0.0 + } + }, + "ema_rate": [ + 0.9999 + ], + "fp16_mode": "inflat_all", + "fp16_scale_growth": 0.001, + "grad_clip": { + "name": "AdaptiveGradClipper", + "args": { + "max_norm": 1.0, + "clip_percentile": 95 + } + }, + "i_log": 500, + "i_sample": 10000, + "i_save": 10000, + "p_uncond": 0.1, + "t_schedule": { + "name": "logitNormal", + "args": { + "mean": 1.0, + "std": 1.0 + } + }, + "sigma_min": 1e-5, + "image_cond_model": "dinov2_vitl14_reg" + } + } +} \ No newline at end of file diff --git a/configs/generation/ss_flow_txt_dit_B_16l8_fp16.json b/configs/generation/ss_flow_txt_dit_B_16l8_fp16.json new file mode 100644 index 0000000..d57e45a --- /dev/null +++ b/configs/generation/ss_flow_txt_dit_B_16l8_fp16.json @@ -0,0 +1,69 @@ +{ + "models": { + "denoiser": { + "name": "SparseStructureFlowModel", + "args": { + "resolution": 16, + "in_channels": 8, + "out_channels": 8, + "model_channels": 768, + "cond_channels": 768, + "num_blocks": 12, + "num_heads": 12, + "mlp_ratio": 4, + "patch_size": 1, + "pe_mode": "ape", + "qk_rms_norm": true, + "use_fp16": true + } + } + }, + "dataset": { + "name": "TextConditionedSparseStructureLatent", + "args": { + "latent_model": "ss_enc_conv3d_16l8_fp16", + "min_aesthetic_score": 4.5, + "pretrained_ss_dec": "microsoft/TRELLIS-image-large/ckpts/ss_dec_conv3d_16l8_fp16" + } + }, + "trainer": { + "name": "TextConditionedFlowMatchingCFGTrainer", + "args": { + "max_steps": 1000000, + "batch_size_per_gpu": 16, + "batch_split": 1, + "optimizer": { + "name": "AdamW", + "args": { + "lr": 0.0001, + "weight_decay": 0.0 + } + }, + "ema_rate": [ + 0.9999 + ], + "fp16_mode": "inflat_all", + "fp16_scale_growth": 0.001, + "grad_clip": { + "name": "AdaptiveGradClipper", + "args": { + "max_norm": 1.0, + "clip_percentile": 95 + } + }, + "i_log": 500, + "i_sample": 10000, + "i_save": 10000, + "p_uncond": 0.1, + "t_schedule": { + "name": "logitNormal", + "args": { + "mean": 1.0, + "std": 1.0 + } + }, + "sigma_min": 1e-5, + "text_cond_model": "openai/clip-vit-large-patch14" + } + } +} \ No newline at end of file diff --git a/configs/generation/ss_flow_txt_dit_L_16l8_fp16.json b/configs/generation/ss_flow_txt_dit_L_16l8_fp16.json new file mode 100644 index 0000000..6fb5130 --- /dev/null +++ b/configs/generation/ss_flow_txt_dit_L_16l8_fp16.json @@ -0,0 +1,69 @@ +{ + "models": { + "denoiser": { + "name": "SparseStructureFlowModel", + "args": { + "resolution": 16, + "in_channels": 8, + "out_channels": 8, + "model_channels": 1024, + "cond_channels": 768, + "num_blocks": 24, + "num_heads": 16, + "mlp_ratio": 4, + "patch_size": 1, + "pe_mode": "ape", + "qk_rms_norm": true, + "use_fp16": true + } + } + }, + "dataset": { + "name": "TextConditionedSparseStructureLatent", + "args": { + "latent_model": "ss_enc_conv3d_16l8_fp16", + "min_aesthetic_score": 4.5, + "pretrained_ss_dec": "microsoft/TRELLIS-image-large/ckpts/ss_dec_conv3d_16l8_fp16" + } + }, + "trainer": { + "name": "TextConditionedFlowMatchingCFGTrainer", + "args": { + "max_steps": 1000000, + "batch_size_per_gpu": 8, + "batch_split": 1, + "optimizer": { + "name": "AdamW", + "args": { + "lr": 0.0001, + "weight_decay": 0.0 + } + }, + "ema_rate": [ + 0.9999 + ], + "fp16_mode": "inflat_all", + "fp16_scale_growth": 0.001, + "grad_clip": { + "name": "AdaptiveGradClipper", + "args": { + "max_norm": 1.0, + "clip_percentile": 95 + } + }, + "i_log": 500, + "i_sample": 10000, + "i_save": 10000, + "p_uncond": 0.1, + "t_schedule": { + "name": "logitNormal", + "args": { + "mean": 1.0, + "std": 1.0 + } + }, + "sigma_min": 1e-5, + "text_cond_model": "openai/clip-vit-large-patch14" + } + } +} \ No newline at end of file diff --git a/configs/generation/ss_flow_txt_dit_XL_16l8_fp16.json b/configs/generation/ss_flow_txt_dit_XL_16l8_fp16.json new file mode 100644 index 0000000..316d5e9 --- /dev/null +++ b/configs/generation/ss_flow_txt_dit_XL_16l8_fp16.json @@ -0,0 +1,70 @@ +{ + "models": { + "denoiser": { + "name": "SparseStructureFlowModel", + "args": { + "resolution": 16, + "in_channels": 8, + "out_channels": 8, + "model_channels": 1280, + "cond_channels": 768, + "num_blocks": 28, + "num_heads": 16, + "mlp_ratio": 4, + "patch_size": 1, + "pe_mode": "ape", + "qk_rms_norm": true, + "qk_rms_norm_cross": true, + "use_fp16": true + } + } + }, + "dataset": { + "name": "TextConditionedSparseStructureLatent", + "args": { + "latent_model": "ss_enc_conv3d_16l8_fp16", + "min_aesthetic_score": 4.5, + "pretrained_ss_dec": "microsoft/TRELLIS-image-large/ckpts/ss_dec_conv3d_16l8_fp16" + } + }, + "trainer": { + "name": "TextConditionedFlowMatchingCFGTrainer", + "args": { + "max_steps": 1000000, + "batch_size_per_gpu": 4, + "batch_split": 1, + "optimizer": { + "name": "AdamW", + "args": { + "lr": 0.0001, + "weight_decay": 0.0 + } + }, + "ema_rate": [ + 0.9999 + ], + "fp16_mode": "inflat_all", + "fp16_scale_growth": 0.001, + "grad_clip": { + "name": "AdaptiveGradClipper", + "args": { + "max_norm": 1.0, + "clip_percentile": 95 + } + }, + "i_log": 500, + "i_sample": 10000, + "i_save": 10000, + "p_uncond": 0.1, + "t_schedule": { + "name": "logitNormal", + "args": { + "mean": 1.0, + "std": 1.0 + } + }, + "sigma_min": 1e-5, + "text_cond_model": "openai/clip-vit-large-patch14" + } + } +} \ No newline at end of file diff --git a/configs/vae/slat_vae_dec_mesh_swin8_B_64l8_fp16.json b/configs/vae/slat_vae_dec_mesh_swin8_B_64l8_fp16.json new file mode 100644 index 0000000..c86b42c --- /dev/null +++ b/configs/vae/slat_vae_dec_mesh_swin8_B_64l8_fp16.json @@ -0,0 +1,73 @@ +{ + "models": { + "decoder": { + "name": "ElasticSLatMeshDecoder", + "args": { + "resolution": 64, + "model_channels": 768, + "latent_channels": 8, + "num_blocks": 12, + "num_heads": 12, + "mlp_ratio": 4, + "attn_mode": "swin", + "window_size": 8, + "use_fp16": true, + "representation_config": { + "use_color": true + } + } + } + }, + "dataset": { + "name": "Slat2RenderGeo", + "args": { + "image_size": 512, + "latent_model": "dinov2_vitl14_reg_slat_enc_swin8_B_64l8_fp16", + "min_aesthetic_score": 4.5, + "max_num_voxels": 32768 + } + }, + "trainer": { + "name": "SLatVaeMeshDecoderTrainer", + "args": { + "max_steps": 1000000, + "batch_size_per_gpu": 4, + "batch_split": 4, + "optimizer": { + "name": "AdamW", + "args": { + "lr": 1e-4, + "weight_decay": 0.0 + } + }, + "ema_rate": [ + 0.9999 + ], + "fp16_mode": "inflat_all", + "fp16_scale_growth": 0.001, + "elastic": { + "name": "LinearMemoryController", + "args": { + "target_ratio": 0.75, + "max_mem_ratio_start": 0.5 + } + }, + "grad_clip": { + "name": "AdaptiveGradClipper", + "args": { + "max_norm": 1.0, + "clip_percentile": 95 + } + }, + "i_log": 500, + "i_sample": 10000, + "i_save": 10000, + "lambda_ssim": 0.2, + "lambda_lpips": 0.2, + "lambda_tsdf": 0.01, + "lambda_depth": 10.0, + "lambda_color": 0.1, + "depth_loss_type": "smooth_l1" + } + } +} \ No newline at end of file diff --git a/configs/vae/ss_vae_conv3d_16l8_fp16.json b/configs/vae/ss_vae_conv3d_16l8_fp16.json new file mode 100644 index 0000000..3847cb8 --- /dev/null +++ b/configs/vae/ss_vae_conv3d_16l8_fp16.json @@ -0,0 +1,65 @@ +{ + "models": { + "encoder": { + "name": "SparseStructureEncoder", + "args": { + "in_channels": 1, + "latent_channels": 8, + "num_res_blocks": 2, + "num_res_blocks_middle": 2, + "channels": [32, 128, 512], + "use_fp16": true + } + }, + "decoder": { + "name": "SparseStructureDecoder", + "args": { + "out_channels": 1, + "latent_channels": 8, + "num_res_blocks": 2, + "num_res_blocks_middle": 2, + "channels": [512, 128, 32], + "use_fp16": true + } + } + }, + "dataset": { + "name": "SparseStructure", + "args": { + "resolution": 64, + "min_aesthetic_score": 4.5 + } + }, + "trainer": { + "name": "SparseStructureVaeTrainer", + "args": { + "max_steps": 1000000, + "batch_size_per_gpu": 4, + "batch_split": 1, + "optimizer": { + "name": "AdamW", + "args": { + "lr": 1e-4, + "weight_decay": 0.0 + } + }, + "ema_rate": [ + 0.9999 + ], + "fp16_mode": "inflat_all", + "fp16_scale_growth": 0.001, + "grad_clip": { + "name": "AdaptiveGradClipper", + "args": { + "max_norm": 1.0, + "clip_percentile": 95 + } + }, + "i_log": 500, + "i_sample": 10000, + "i_save": 10000, + "loss_type": "dice", + "lambda_kl": 0.001 + } + } +} \ No newline at end of file