This commit is contained in:
zcr
2026-03-17 11:29:47 +08:00
parent a6d9bac6d0
commit 06659057c3
26 changed files with 3742 additions and 0 deletions

View File

@@ -0,0 +1,92 @@
import os
import re
import argparse
import zipfile
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import pandas as pd
from utils import get_file_hash
def add_args(parser: argparse.ArgumentParser):
pass
def get_metadata(**kwargs):
metadata = pd.read_csv("hf://datasets/JeffreyXiang/TRELLIS-500K/Toys4k.csv")
return metadata
def download(metadata, output_dir, **kwargs):
os.makedirs(output_dir, exist_ok=True)
if not os.path.exists(os.path.join(output_dir, 'raw', 'toys4k_blend_files.zip')):
print("\033[93m")
print("Toys4k have to be downloaded manually")
print(f"Please download the toys4k_blend_files.zip file and place it in the {output_dir}/raw directory")
print("Visit https://github.com/rehg-lab/lowshot-shapebias/tree/main/toys4k for more information")
print("\033[0m")
raise FileNotFoundError("toys4k_blend_files.zip not found")
downloaded = {}
metadata = metadata.set_index("file_identifier")
with zipfile.ZipFile(os.path.join(output_dir, 'raw', 'toys4k_blend_files.zip')) as zip_ref:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor, \
tqdm(total=len(metadata), desc="Extracting") as pbar:
def worker(instance: str) -> str:
try:
zip_ref.extract(os.path.join('toys4k_blend_files', instance), os.path.join(output_dir, 'raw'))
sha256 = get_file_hash(os.path.join(output_dir, 'raw/toys4k_blend_files', instance))
pbar.update()
return sha256
except Exception as e:
pbar.update()
print(f"Error extracting for {instance}: {e}")
return None
sha256s = executor.map(worker, metadata.index)
executor.shutdown(wait=True)
for k, sha256 in zip(metadata.index, sha256s):
if sha256 is not None:
if sha256 == metadata.loc[k, "sha256"]:
downloaded[sha256] = os.path.join("raw/toys4k_blend_files", k)
else:
print(f"Error downloading {k}: sha256s do not match")
return pd.DataFrame(downloaded.items(), columns=['sha256', 'local_path'])
def foreach_instance(metadata, output_dir, func, max_workers=None, desc='Processing objects') -> pd.DataFrame:
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
# load metadata
metadata = metadata.to_dict('records')
# processing objects
records = []
max_workers = max_workers or os.cpu_count()
try:
with ThreadPoolExecutor(max_workers=max_workers) as executor, \
tqdm(total=len(metadata), desc=desc) as pbar:
def worker(metadatum):
try:
local_path = metadatum['local_path']
sha256 = metadatum['sha256']
file = os.path.join(output_dir, local_path)
record = func(file, sha256)
if record is not None:
records.append(record)
pbar.update()
except Exception as e:
print(f"Error processing object {sha256}: {e}")
pbar.update()
executor.map(worker, metadata)
executor.shutdown(wait=True)
except:
print("Error happened during processing.")
return pd.DataFrame.from_records(records)

View File

@@ -0,0 +1 @@
pip install pillow imageio imageio-ffmpeg tqdm easydict opencv-python-headless pandas open3d objaverse huggingface_hub open_clip_torch

43
dataset_toolkits/utils.py Normal file
View File

@@ -0,0 +1,43 @@
from typing import *
import hashlib
import numpy as np
def get_file_hash(file: str) -> str:
sha256 = hashlib.sha256()
# Read the file from the path
with open(file, "rb") as f:
# Update the hash with the file content
for byte_block in iter(lambda: f.read(4096), b""):
sha256.update(byte_block)
return sha256.hexdigest()
# ===============LOW DISCREPANCY SEQUENCES================
PRIMES = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53]
def radical_inverse(base, n):
val = 0
inv_base = 1.0 / base
inv_base_n = inv_base
while n > 0:
digit = n % base
val += digit * inv_base_n
n //= base
inv_base_n *= inv_base
return val
def halton_sequence(dim, n):
return [radical_inverse(PRIMES[dim], n) for dim in range(dim)]
def hammersley_sequence(dim, n, num_samples):
return [n / num_samples] + halton_sequence(dim - 1, n)
def sphere_hammersley_sequence(n, num_samples, offset=(0, 0)):
u, v = hammersley_sequence(2, n, num_samples)
u += offset[0] / num_samples
v += offset[1]
u = 2 * u if u < 0.25 else 2 / 3 * u + 1 / 3
theta = np.arccos(1 - 2 * u) - np.pi / 2
phi = v * 2 * np.pi
return [phi, theta]

View File

@@ -0,0 +1,86 @@
import os
import copy
import sys
import importlib
import argparse
import pandas as pd
from easydict import EasyDict as edict
from functools import partial
import numpy as np
import open3d as o3d
import utils3d
def _voxelize(file, sha256, output_dir):
mesh = o3d.io.read_triangle_mesh(os.path.join(output_dir, 'renders', sha256, 'mesh.ply'))
# clamp vertices to the range [-0.5, 0.5]
vertices = np.clip(np.asarray(mesh.vertices), -0.5 + 1e-6, 0.5 - 1e-6)
mesh.vertices = o3d.utility.Vector3dVector(vertices)
voxel_grid = o3d.geometry.VoxelGrid.create_from_triangle_mesh_within_bounds(mesh, voxel_size=1/64, min_bound=(-0.5, -0.5, -0.5), max_bound=(0.5, 0.5, 0.5))
vertices = np.array([voxel.grid_index for voxel in voxel_grid.get_voxels()])
assert np.all(vertices >= 0) and np.all(vertices < 64), "Some vertices are out of bounds"
vertices = (vertices + 0.5) / 64 - 0.5
utils3d.io.write_ply(os.path.join(output_dir, 'voxels', f'{sha256}.ply'), vertices)
return {'sha256': sha256, 'voxelized': True, 'num_voxels': len(vertices)}
if __name__ == '__main__':
dataset_utils = importlib.import_module(f'datasets.{sys.argv[1]}')
parser = argparse.ArgumentParser()
parser.add_argument('--output_dir', type=str, required=True,
help='Directory to save the metadata')
parser.add_argument('--filter_low_aesthetic_score', type=float, default=None,
help='Filter objects with aesthetic score lower than this value')
parser.add_argument('--instances', type=str, default=None,
help='Instances to process')
parser.add_argument('--num_views', type=int, default=150,
help='Number of views to render')
dataset_utils.add_args(parser)
parser.add_argument('--rank', type=int, default=0)
parser.add_argument('--world_size', type=int, default=1)
parser.add_argument('--max_workers', type=int, default=None)
opt = parser.parse_args(sys.argv[2:])
opt = edict(vars(opt))
os.makedirs(os.path.join(opt.output_dir, 'voxels'), exist_ok=True)
# get file list
if not os.path.exists(os.path.join(opt.output_dir, 'metadata.csv')):
raise ValueError('metadata.csv not found')
metadata = pd.read_csv(os.path.join(opt.output_dir, 'metadata.csv'))
if opt.instances is None:
if opt.filter_low_aesthetic_score is not None:
metadata = metadata[metadata['aesthetic_score'] >= opt.filter_low_aesthetic_score]
if 'rendered' not in metadata.columns:
raise ValueError('metadata.csv does not have "rendered" column, please run "build_metadata.py" first')
metadata = metadata[metadata['rendered'] == True]
if 'voxelized' in metadata.columns:
metadata = metadata[metadata['voxelized'] == False]
else:
if os.path.exists(opt.instances):
with open(opt.instances, 'r') as f:
instances = f.read().splitlines()
else:
instances = opt.instances.split(',')
metadata = metadata[metadata['sha256'].isin(instances)]
start = len(metadata) * opt.rank // opt.world_size
end = len(metadata) * (opt.rank + 1) // opt.world_size
metadata = metadata[start:end]
records = []
# filter out objects that are already processed
for sha256 in copy.copy(metadata['sha256'].values):
if os.path.exists(os.path.join(opt.output_dir, 'voxels', f'{sha256}.ply')):
pts = utils3d.io.read_ply(os.path.join(opt.output_dir, 'voxels', f'{sha256}.ply'))[0]
records.append({'sha256': sha256, 'voxelized': True, 'num_voxels': len(pts)})
metadata = metadata[metadata['sha256'] != sha256]
print(f'Processing {len(metadata)} objects...')
# process objects
func = partial(_voxelize, output_dir=opt.output_dir)
voxelized = dataset_utils.foreach_instance(metadata, opt.output_dir, func, max_workers=opt.max_workers, desc='Voxelizing')
voxelized = pd.concat([voxelized, pd.DataFrame.from_records(records)])
voxelized.to_csv(os.path.join(opt.output_dir, f'voxelized_{opt.rank}.csv'), index=False)