1

2026-03-17 11:29:47 +08:00
parent a6d9bac6d0
commit 06659057c3
26 changed files with 3742 additions and 0 deletions
--- a/configs/vae/slat_vae_dec_rf_swin8_B_64l8_fp16.json
+++ b/configs/vae/slat_vae_dec_rf_swin8_B_64l8_fp16.json
@@ -0,0 +1,71 @@
+{
+    "models": {
+        "decoder": {
+            "name": "ElasticSLatRadianceFieldDecoder",
+            "args": {
+                "resolution": 64,
+                "model_channels": 768,
+                "latent_channels": 8,
+                "num_blocks": 12,
+                "num_heads": 12,
+                "mlp_ratio": 4,
+                "attn_mode": "swin",
+                "window_size": 8,
+                "use_fp16": true,
+                "representation_config": {
+                    "rank": 16,
+                    "dim": 8
+                }
+            }
+        }
+    },
+    "dataset": {
+        "name": "SLat2Render",
+        "args": {
+            "image_size": 512,
+            "latent_model": "dinov2_vitl14_reg_slat_enc_swin8_B_64l8_fp16",
+            "min_aesthetic_score": 4.5,
+            "max_num_voxels": 32768
+        }
+    },
+    "trainer": {
+        "name": "SLatVaeRadianceFieldDecoderTrainer",
+        "args": {
+            "max_steps": 1000000,
+            "batch_size_per_gpu": 4,
+            "batch_split": 2,
+            "optimizer": {
+                "name": "AdamW",
+                "args": {
+                    "lr": 1e-4,
+                    "weight_decay": 0.0
+                }
+            },
+            "ema_rate": [
+                0.9999
+            ],
+            "fp16_mode": "inflat_all",
+            "fp16_scale_growth": 0.001,
+            "elastic": {
+                "name": "LinearMemoryController",
+                "args": {
+                    "target_ratio": 0.75,
+                    "max_mem_ratio_start": 0.5
+                }
+            },
+            "grad_clip": {
+                "name": "AdaptiveGradClipper",
+                "args": {
+                    "max_norm": 1.0,
+                    "clip_percentile": 95
+                }
+            },
+            "i_log": 500,
+            "i_sample": 10000,
+            "i_save": 10000,
+            "loss_type": "l1",
+            "lambda_ssim": 0.2,
+            "lambda_lpips": 0.2
+        }
+    }
+}
--- a/configs/vae/slat_vae_enc_dec_gs_swin8_B_64l8_fp16.json
+++ b/configs/vae/slat_vae_enc_dec_gs_swin8_B_64l8_fp16.json
@@ -0,0 +1,105 @@
+{
+    "models": {
+        "encoder": {
+            "name": "ElasticSLatEncoder",
+            "args": {
+                "resolution": 64,
+                "in_channels": 1024,
+                "model_channels": 768,
+                "latent_channels": 8,
+                "num_blocks": 12,
+                "num_heads": 12,
+                "mlp_ratio": 4,
+                "attn_mode": "swin",
+                "window_size": 8,
+                "use_fp16": true
+            }
+        },
+        "decoder": {
+            "name": "ElasticSLatGaussianDecoder",
+            "args": {
+                "resolution": 64,
+                "model_channels": 768,
+                "latent_channels": 8,
+                "num_blocks": 12,
+                "num_heads": 12,
+                "mlp_ratio": 4,
+                "attn_mode": "swin",
+                "window_size": 8,
+                "use_fp16": true,
+                "representation_config": {
+                    "lr": {
+                        "_xyz": 1.0,
+                        "_features_dc": 1.0,
+                        "_opacity": 1.0,
+                        "_scaling": 1.0,
+                        "_rotation": 0.1
+                    },
+                    "perturb_offset": true,
+                    "voxel_size": 1.5,
+                    "num_gaussians": 32,
+                    "2d_filter_kernel_size": 0.1,
+                    "3d_filter_kernel_size": 9e-4,
+                    "scaling_bias": 4e-3,
+                    "opacity_bias": 0.1,
+                    "scaling_activation": "softplus"
+                }
+            }
+        }
+    },
+    "dataset": {
+        "name": "SparseFeat2Render",
+        "args": {
+            "image_size": 512,
+            "model": "dinov2_vitl14_reg",
+            "resolution": 64,
+            "min_aesthetic_score": 4.5,
+            "max_num_voxels": 32768
+        }
+    },
+    "trainer": {
+        "name": "SLatVaeGaussianTrainer",
+        "args": {
+            "max_steps": 1000000,
+            "batch_size_per_gpu": 4,
+            "batch_split": 2,
+            "optimizer": {
+                "name": "AdamW",
+                "args": {
+                    "lr": 1e-4,
+                    "weight_decay": 0.0
+                }
+            },
+            "ema_rate": [
+                0.9999
+            ],
+            "fp16_mode": "inflat_all",
+            "fp16_scale_growth": 0.001,
+            "elastic": {
+                "name": "LinearMemoryController",
+                "args": {
+                    "target_ratio": 0.75,
+                    "max_mem_ratio_start": 0.5
+                }
+            },
+            "grad_clip": {
+                "name": "AdaptiveGradClipper",
+                "args": {
+                    "max_norm": 1.0,
+                    "clip_percentile": 95
+                }
+            },
+            "i_log": 500,
+            "i_sample": 10000,
+            "i_save": 10000,
+            "loss_type": "l1",
+            "lambda_ssim": 0.2,
+            "lambda_lpips": 0.2,
+            "lambda_kl": 1e-06,
+            "regularizations": {
+                "lambda_vol": 10000.0,
+                "lambda_opacity": 0.001
+            }
+        }
+    }
+}