This commit is contained in:
zcr
2026-03-17 11:30:16 +08:00
parent ca0c74e18b
commit 653f4d304d
6 changed files with 416 additions and 0 deletions

View File

@@ -0,0 +1,70 @@
{
  "models": {
    "denoiser": {
      "name": "SparseStructureFlowModel",
      "args": {
        "resolution": 16,
        "in_channels": 8,
        "out_channels": 8,
        "model_channels": 1024,
        "cond_channels": 1024,
        "num_blocks": 24,
        "num_heads": 16,
        "mlp_ratio": 4,
        "patch_size": 1,
        "pe_mode": "ape",
        "qk_rms_norm": true,
        "use_fp16": true
      }
    }
  },
  "dataset": {
    "name": "ImageConditionedSparseStructureLatent",
    "args": {
      "latent_model": "ss_enc_conv3d_16l8_fp16",
      "min_aesthetic_score": 4.5,
      "image_size": 518,
      "pretrained_ss_dec": "microsoft/TRELLIS-image-large/ckpts/ss_dec_conv3d_16l8_fp16"
    }
  },
  "trainer": {
    "name": "ImageConditionedFlowMatchingCFGTrainer",
    "args": {
      "max_steps": 1000000,
      "batch_size_per_gpu": 8,
      "batch_split": 1,
      "optimizer": {
        "name": "AdamW",
        "args": {
          "lr": 0.0001,
          "weight_decay": 0.0
        }
      },
      "ema_rate": [
        0.9999
      ],
      "fp16_mode": "inflat_all",
      "fp16_scale_growth": 0.001,
      "grad_clip": {
        "name": "AdaptiveGradClipper",
        "args": {
          "max_norm": 1.0,
          "clip_percentile": 95
        }
      },
      "i_log": 500,
      "i_sample": 10000,
      "i_save": 10000,
      "p_uncond": 0.1,
      "t_schedule": {
        "name": "logitNormal",
        "args": {
          "mean": 1.0,
          "std": 1.0
        }
      },
      "sigma_min": 1e-5,
      "image_cond_model": "dinov2_vitl14_reg"
    }
  }
}

View File

@@ -0,0 +1,69 @@
{
  "models": {
    "denoiser": {
      "name": "SparseStructureFlowModel",
      "args": {
        "resolution": 16,
        "in_channels": 8,
        "out_channels": 8,
        "model_channels": 768,
        "cond_channels": 768,
        "num_blocks": 12,
        "num_heads": 12,
        "mlp_ratio": 4,
        "patch_size": 1,
        "pe_mode": "ape",
        "qk_rms_norm": true,
        "use_fp16": true
      }
    }
  },
  "dataset": {
    "name": "TextConditionedSparseStructureLatent",
    "args": {
      "latent_model": "ss_enc_conv3d_16l8_fp16",
      "min_aesthetic_score": 4.5,
      "pretrained_ss_dec": "microsoft/TRELLIS-image-large/ckpts/ss_dec_conv3d_16l8_fp16"
    }
  },
  "trainer": {
    "name": "TextConditionedFlowMatchingCFGTrainer",
    "args": {
      "max_steps": 1000000,
      "batch_size_per_gpu": 16,
      "batch_split": 1,
      "optimizer": {
        "name": "AdamW",
        "args": {
          "lr": 0.0001,
          "weight_decay": 0.0
        }
      },
      "ema_rate": [
        0.9999
      ],
      "fp16_mode": "inflat_all",
      "fp16_scale_growth": 0.001,
      "grad_clip": {
        "name": "AdaptiveGradClipper",
        "args": {
          "max_norm": 1.0,
          "clip_percentile": 95
        }
      },
      "i_log": 500,
      "i_sample": 10000,
      "i_save": 10000,
      "p_uncond": 0.1,
      "t_schedule": {
        "name": "logitNormal",
        "args": {
          "mean": 1.0,
          "std": 1.0
        }
      },
      "sigma_min": 1e-5,
      "text_cond_model": "openai/clip-vit-large-patch14"
    }
  }
}

View File

@@ -0,0 +1,69 @@
{
  "models": {
    "denoiser": {
      "name": "SparseStructureFlowModel",
      "args": {
        "resolution": 16,
        "in_channels": 8,
        "out_channels": 8,
        "model_channels": 1024,
        "cond_channels": 768,
        "num_blocks": 24,
        "num_heads": 16,
        "mlp_ratio": 4,
        "patch_size": 1,
        "pe_mode": "ape",
        "qk_rms_norm": true,
        "use_fp16": true
      }
    }
  },
  "dataset": {
    "name": "TextConditionedSparseStructureLatent",
    "args": {
      "latent_model": "ss_enc_conv3d_16l8_fp16",
      "min_aesthetic_score": 4.5,
      "pretrained_ss_dec": "microsoft/TRELLIS-image-large/ckpts/ss_dec_conv3d_16l8_fp16"
    }
  },
  "trainer": {
    "name": "TextConditionedFlowMatchingCFGTrainer",
    "args": {
      "max_steps": 1000000,
      "batch_size_per_gpu": 8,
      "batch_split": 1,
      "optimizer": {
        "name": "AdamW",
        "args": {
          "lr": 0.0001,
          "weight_decay": 0.0
        }
      },
      "ema_rate": [
        0.9999
      ],
      "fp16_mode": "inflat_all",
      "fp16_scale_growth": 0.001,
      "grad_clip": {
        "name": "AdaptiveGradClipper",
        "args": {
          "max_norm": 1.0,
          "clip_percentile": 95
        }
      },
      "i_log": 500,
      "i_sample": 10000,
      "i_save": 10000,
      "p_uncond": 0.1,
      "t_schedule": {
        "name": "logitNormal",
        "args": {
          "mean": 1.0,
          "std": 1.0
        }
      },
      "sigma_min": 1e-5,
      "text_cond_model": "openai/clip-vit-large-patch14"
    }
  }
}

View File

@@ -0,0 +1,70 @@
{
  "models": {
    "denoiser": {
      "name": "SparseStructureFlowModel",
      "args": {
        "resolution": 16,
        "in_channels": 8,
        "out_channels": 8,
        "model_channels": 1280,
        "cond_channels": 768,
        "num_blocks": 28,
        "num_heads": 16,
        "mlp_ratio": 4,
        "patch_size": 1,
        "pe_mode": "ape",
        "qk_rms_norm": true,
        "qk_rms_norm_cross": true,
        "use_fp16": true
      }
    }
  },
  "dataset": {
    "name": "TextConditionedSparseStructureLatent",
    "args": {
      "latent_model": "ss_enc_conv3d_16l8_fp16",
      "min_aesthetic_score": 4.5,
      "pretrained_ss_dec": "microsoft/TRELLIS-image-large/ckpts/ss_dec_conv3d_16l8_fp16"
    }
  },
  "trainer": {
    "name": "TextConditionedFlowMatchingCFGTrainer",
    "args": {
      "max_steps": 1000000,
      "batch_size_per_gpu": 4,
      "batch_split": 1,
      "optimizer": {
        "name": "AdamW",
        "args": {
          "lr": 0.0001,
          "weight_decay": 0.0
        }
      },
      "ema_rate": [
        0.9999
      ],
      "fp16_mode": "inflat_all",
      "fp16_scale_growth": 0.001,
      "grad_clip": {
        "name": "AdaptiveGradClipper",
        "args": {
          "max_norm": 1.0,
          "clip_percentile": 95
        }
      },
      "i_log": 500,
      "i_sample": 10000,
      "i_save": 10000,
      "p_uncond": 0.1,
      "t_schedule": {
        "name": "logitNormal",
        "args": {
          "mean": 1.0,
          "std": 1.0
        }
      },
      "sigma_min": 1e-5,
      "text_cond_model": "openai/clip-vit-large-patch14"
    }
  }
}

View File

@@ -0,0 +1,73 @@
{
  "models": {
    "decoder": {
      "name": "ElasticSLatMeshDecoder",
      "args": {
        "resolution": 64,
        "model_channels": 768,
        "latent_channels": 8,
        "num_blocks": 12,
        "num_heads": 12,
        "mlp_ratio": 4,
        "attn_mode": "swin",
        "window_size": 8,
        "use_fp16": true,
        "representation_config": {
          "use_color": true
        }
      }
    }
  },
  "dataset": {
    "name": "Slat2RenderGeo",
    "args": {
      "image_size": 512,
      "latent_model": "dinov2_vitl14_reg_slat_enc_swin8_B_64l8_fp16",
      "min_aesthetic_score": 4.5,
      "max_num_voxels": 32768
    }
  },
  "trainer": {
    "name": "SLatVaeMeshDecoderTrainer",
    "args": {
      "max_steps": 1000000,
      "batch_size_per_gpu": 4,
      "batch_split": 4,
      "optimizer": {
        "name": "AdamW",
        "args": {
          "lr": 1e-4,
          "weight_decay": 0.0
        }
      },
      "ema_rate": [
        0.9999
      ],
      "fp16_mode": "inflat_all",
      "fp16_scale_growth": 0.001,
      "elastic": {
        "name": "LinearMemoryController",
        "args": {
          "target_ratio": 0.75,
          "max_mem_ratio_start": 0.5
        }
      },
      "grad_clip": {
        "name": "AdaptiveGradClipper",
        "args": {
          "max_norm": 1.0,
          "clip_percentile": 95
        }
      },
      "i_log": 500,
      "i_sample": 10000,
      "i_save": 10000,
      "lambda_ssim": 0.2,
      "lambda_lpips": 0.2,
      "lambda_tsdf": 0.01,
      "lambda_depth": 10.0,
      "lambda_color": 0.1,
      "depth_loss_type": "smooth_l1"
    }
  }
}

View File

@@ -0,0 +1,65 @@
{
  "models": {
    "encoder": {
      "name": "SparseStructureEncoder",
      "args": {
        "in_channels": 1,
        "latent_channels": 8,
        "num_res_blocks": 2,
        "num_res_blocks_middle": 2,
        "channels": [32, 128, 512],
        "use_fp16": true
      }
    },
    "decoder": {
      "name": "SparseStructureDecoder",
      "args": {
        "out_channels": 1,
        "latent_channels": 8,
        "num_res_blocks": 2,
        "num_res_blocks_middle": 2,
        "channels": [512, 128, 32],
        "use_fp16": true
      }
    }
  },
  "dataset": {
    "name": "SparseStructure",
    "args": {
      "resolution": 64,
      "min_aesthetic_score": 4.5
    }
  },
  "trainer": {
    "name": "SparseStructureVaeTrainer",
    "args": {
      "max_steps": 1000000,
      "batch_size_per_gpu": 4,
      "batch_split": 1,
      "optimizer": {
        "name": "AdamW",
        "args": {
          "lr": 1e-4,
          "weight_decay": 0.0
        }
      },
      "ema_rate": [
        0.9999
      ],
      "fp16_mode": "inflat_all",
      "fp16_scale_growth": 0.001,
      "grad_clip": {
        "name": "AdaptiveGradClipper",
        "args": {
          "max_norm": 1.0,
          "clip_percentile": 95
        }
      },
      "i_log": 500,
      "i_sample": 10000,
      "i_save": 10000,
      "loss_type": "dice",
      "lambda_kl": 0.001
    }
  }
}