1
This commit is contained in:
92
dataset_toolkits/datasets/Toys4k.py
Normal file
92
dataset_toolkits/datasets/Toys4k.py
Normal file
@@ -0,0 +1,92 @@
|
||||
import os
|
||||
import re
|
||||
import argparse
|
||||
import zipfile
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
from utils import get_file_hash
|
||||
|
||||
|
||||
def add_args(parser: argparse.ArgumentParser):
    """Hook for registering dataset-specific command-line options.

    This dataset module adds no options of its own, so ``parser`` is left
    untouched and nothing is returned.
    """
    return None
|
||||
|
||||
|
||||
def get_metadata(**kwargs):
    """Load the Toys4k metadata table.

    Reads ``Toys4k.csv`` from the ``hf://datasets/JeffreyXiang/TRELLIS-500K``
    location with pandas and returns the resulting DataFrame. Any keyword
    arguments are accepted and ignored.
    """
    return pd.read_csv("hf://datasets/JeffreyXiang/TRELLIS-500K/Toys4k.csv")
|
||||
|
||||
|
||||
def download(metadata, output_dir, **kwargs):
    """Extract the Toys4k blend files from the manually downloaded archive.

    The archive ``toys4k_blend_files.zip`` must already be present in
    ``<output_dir>/raw``; it cannot be fetched automatically. Every entry
    listed in ``metadata`` is extracted, hashed and compared against the
    expected checksum.

    Args:
        metadata: DataFrame with a ``file_identifier`` column (path of the
            file inside the ``toys4k_blend_files`` folder of the archive) and
            a ``sha256`` column (expected checksum).
        output_dir: Root directory of the dataset; files are extracted into
            ``<output_dir>/raw/toys4k_blend_files``.
        **kwargs: Accepted and ignored.

    Returns:
        DataFrame with columns ``sha256`` and ``local_path`` (relative to
        ``output_dir``), one row per file whose checksum matched.

    Raises:
        FileNotFoundError: If the archive is missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    raw_dir = os.path.join(output_dir, 'raw')
    archive_path = os.path.join(raw_dir, 'toys4k_blend_files.zip')
    if not os.path.exists(archive_path):
        print("\033[93m")
        print("Toys4k have to be downloaded manually")
        print(f"Please download the toys4k_blend_files.zip file and place it in the {output_dir}/raw directory")
        print("Visit https://github.com/rehg-lab/lowshot-shapebias/tree/main/toys4k for more information")
        print("\033[0m")
        raise FileNotFoundError("toys4k_blend_files.zip not found")

    downloaded = {}
    metadata = metadata.set_index("file_identifier")
    with zipfile.ZipFile(archive_path) as zip_ref:
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor, \
                tqdm(total=len(metadata), desc="Extracting") as pbar:
            def worker(instance: str):
                """Extract one archive member and return its sha256 (None on failure)."""
                try:
                    # Zip member names always use forward slashes, so the name
                    # must not be built with os.path.join (which would produce
                    # backslashes on Windows and fail the lookup).
                    zip_ref.extract(f'toys4k_blend_files/{instance}', raw_dir)
                    return get_file_hash(os.path.join(raw_dir, 'toys4k_blend_files', instance))
                except Exception as e:
                    print(f"Error extracting for {instance}: {e}")
                    return None
                finally:
                    pbar.update()

            # Collect the results while the pool is still alive; leaving the
            # ``with`` block waits for the workers, so no explicit shutdown
            # is needed.
            sha256s = list(executor.map(worker, metadata.index))

    for k, sha256 in zip(metadata.index, sha256s):
        if sha256 is not None:
            if sha256 == metadata.loc[k, "sha256"]:
                downloaded[sha256] = os.path.join("raw/toys4k_blend_files", k)
            else:
                print(f"Error downloading {k}: sha256s do not match")

    return pd.DataFrame(downloaded.items(), columns=['sha256', 'local_path'])
|
||||
|
||||
|
||||
def foreach_instance(metadata, output_dir, func, max_workers=None, desc='Processing objects') -> pd.DataFrame:
    """Apply ``func`` to every instance listed in ``metadata`` using a thread pool.

    Args:
        metadata: DataFrame whose rows provide ``local_path`` (relative to
            ``output_dir``) and ``sha256``.
        output_dir: Root directory that ``local_path`` is joined onto.
        func: Callable invoked as ``func(file, sha256)``; it returns a record
            (dict) or ``None`` to contribute nothing.
        max_workers: Thread count; falls back to ``os.cpu_count()`` when falsy.
        desc: Label shown on the progress bar.

    Returns:
        DataFrame built from all non-``None`` records. Processing is
        best-effort: a failure on one instance is printed and skipped, and if
        the whole run is aborted the records gathered so far are still
        returned.
    """
    import os
    from concurrent.futures import ThreadPoolExecutor
    from tqdm import tqdm

    # load metadata
    metadata = metadata.to_dict('records')

    # processing objects
    records = []
    max_workers = max_workers or os.cpu_count()
    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor, \
                tqdm(total=len(metadata), desc=desc) as pbar:
            def worker(metadatum):
                # Resolve the identifier up front so the error message below
                # can always name the instance, even when a key is missing.
                sha256 = metadatum.get('sha256')
                try:
                    file = os.path.join(output_dir, metadatum['local_path'])
                    record = func(file, metadatum['sha256'])
                    if record is not None:
                        records.append(record)
                except Exception as e:
                    print(f"Error processing object {sha256}: {e}")
                finally:
                    # Advance the bar exactly once per instance, success or not.
                    pbar.update()

            # Leaving the ``with`` block waits for all submitted work.
            executor.map(worker, metadata)
    except (Exception, KeyboardInterrupt) as e:
        # Deliberately best-effort: report the cause and fall through so the
        # caller still receives whatever was processed before the failure
        # (including a user interrupt).
        print(f"Error happened during processing: {e!r}")

    return pd.DataFrame.from_records(records)
|
||||
1
dataset_toolkits/setup.sh
Normal file
1
dataset_toolkits/setup.sh
Normal file
@@ -0,0 +1 @@
|
||||
# Install the Python packages listed for the dataset toolkit.
pip install \
    pillow imageio imageio-ffmpeg tqdm easydict opencv-python-headless \
    pandas open3d objaverse huggingface_hub open_clip_torch
|
||||
43
dataset_toolkits/utils.py
Normal file
43
dataset_toolkits/utils.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from typing import *
|
||||
import hashlib
|
||||
import numpy as np
|
||||
|
||||
|
||||
def get_file_hash(file: str, chunk_size: int = 4096) -> str:
    """Return the hex-encoded SHA-256 digest of the file at ``file``.

    The file is read in binary mode in blocks of ``chunk_size`` bytes so that
    large files are never loaded into memory at once.

    Args:
        file: Path of the file to hash.
        chunk_size: Number of bytes read per block; must be positive.

    Raises:
        ValueError: If ``chunk_size`` is not positive (a size of 0 would end
            the read loop immediately and hash nothing).
        OSError: If the file cannot be opened or read.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    sha256 = hashlib.sha256()
    # Read the file from the path
    with open(file, "rb") as f:
        # Update the hash with the file content, one block at a time
        for byte_block in iter(lambda: f.read(chunk_size), b""):
            sha256.update(byte_block)
    return sha256.hexdigest()
|
||||
|
||||
# ===============LOW DISCREPANCY SEQUENCES================
|
||||
|
||||
# The first 16 primes; halton_sequence uses PRIMES[i] as the base for dimension i.
PRIMES = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53]
|
||||
|
||||
def radical_inverse(base, n):
    """Return the radical inverse of ``n`` in the given ``base``.

    The digits of ``n`` written in ``base`` are mirrored around the radix
    point: the least significant digit is weighted by ``1/base``, the next by
    ``1/base**2``, and so on. Returns 0 when ``n`` is not positive.
    """
    result = 0
    step = 1.0 / base
    weight = step
    while n > 0:
        # Peel off the lowest digit and shift the remaining digits down.
        n, digit = divmod(n, base)
        result += digit * weight
        weight *= step
    return result
|
||||
|
||||
def halton_sequence(dim, n):
    """Return the ``n``-th Halton point as a list of ``dim`` coordinates.

    Coordinate ``i`` is the radical inverse of ``n`` in base ``PRIMES[i]``.
    """
    return [radical_inverse(PRIMES[axis], n) for axis in range(dim)]
|
||||
|
||||
def hammersley_sequence(dim, n, num_samples):
    """Return the ``n``-th point of a ``dim``-dimensional Hammersley set.

    The first coordinate is ``n / num_samples``; the remaining ``dim - 1``
    coordinates are taken from the Halton sequence at index ``n``.
    """
    leading = n / num_samples
    trailing = halton_sequence(dim - 1, n)
    return [leading] + trailing
|
||||
|
||||
def sphere_hammersley_sequence(n, num_samples, offset=(0, 0)):
    """Map the ``n``-th 2D Hammersley point to a pair of angles.

    ``offset[0] / num_samples`` is added to the first coordinate ``u`` and
    ``offset[1]`` to the second coordinate ``v`` before mapping.

    Returns:
        ``[phi, theta]`` with ``phi = 2 * pi * v`` and
        ``theta = arccos(1 - 2 * u) - pi / 2`` (presumably azimuth and
        elevation in radians; confirm against callers).
    """
    u, v = hammersley_sequence(2, n, num_samples)
    u += offset[0] / num_samples
    v += offset[1]
    # Piecewise-linear remap of u: doubled below 0.25, compressed above it
    # (the two branches agree at u = 0.25).
    if u < 0.25:
        u = 2 * u
    else:
        u = 2 / 3 * u + 1 / 3
    theta = np.arccos(1 - 2 * u) - np.pi / 2
    phi = v * 2 * np.pi
    return [phi, theta]
|
||||
86
dataset_toolkits/voxelize.py
Normal file
86
dataset_toolkits/voxelize.py
Normal file
@@ -0,0 +1,86 @@
|
||||
import os
|
||||
import copy
|
||||
import sys
|
||||
import importlib
|
||||
import argparse
|
||||
import pandas as pd
|
||||
from easydict import EasyDict as edict
|
||||
from functools import partial
|
||||
import numpy as np
|
||||
import open3d as o3d
|
||||
import utils3d
|
||||
|
||||
|
||||
def _voxelize(file, sha256, output_dir):
    """Voxelize the rendered mesh of one object on a 64^3 grid.

    Reads ``<output_dir>/renders/<sha256>/mesh.ply``, voxelizes it inside the
    cube ``[-0.5, 0.5]^3`` and writes the occupied voxel centers to
    ``<output_dir>/voxels/<sha256>.ply``. ``file`` is part of the callback
    signature but is not used here.

    Returns:
        Record dict with ``sha256``, ``voxelized`` (always True) and
        ``num_voxels``.
    """
    resolution = 64
    mesh_path = os.path.join(output_dir, 'renders', sha256, 'mesh.ply')
    voxel_path = os.path.join(output_dir, 'voxels', f'{sha256}.ply')

    mesh = o3d.io.read_triangle_mesh(mesh_path)
    # clamp vertices to the range [-0.5, 0.5] (slightly inside the bounds)
    clamped = np.clip(np.asarray(mesh.vertices), -0.5 + 1e-6, 0.5 - 1e-6)
    mesh.vertices = o3d.utility.Vector3dVector(clamped)
    voxel_grid = o3d.geometry.VoxelGrid.create_from_triangle_mesh_within_bounds(
        mesh,
        voxel_size=1 / resolution,
        min_bound=(-0.5, -0.5, -0.5),
        max_bound=(0.5, 0.5, 0.5),
    )
    indices = np.array([voxel.grid_index for voxel in voxel_grid.get_voxels()])
    assert np.all(indices >= 0) and np.all(indices < resolution), "Some vertices are out of bounds"
    # Convert integer grid indices to voxel-center coordinates in [-0.5, 0.5].
    centers = (indices + 0.5) / resolution - 0.5
    utils3d.io.write_ply(voxel_path, centers)
    return {'sha256': sha256, 'voxelized': True, 'num_voxels': len(centers)}
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Usage: voxelize.py <dataset> --output_dir DIR [options]
    # The first positional argument names the module under ``datasets`` that
    # provides ``add_args`` and ``foreach_instance`` for that dataset.
    dataset_utils = importlib.import_module(f'datasets.{sys.argv[1]}')

    parser = argparse.ArgumentParser()
    parser.add_argument('--output_dir', type=str, required=True,
                        help='Directory to save the metadata')
    parser.add_argument('--filter_low_aesthetic_score', type=float, default=None,
                        help='Filter objects with aesthetic score lower than this value')
    parser.add_argument('--instances', type=str, default=None,
                        help='Instances to process')
    parser.add_argument('--num_views', type=int, default=150,
                        help='Number of views to render')
    dataset_utils.add_args(parser)
    parser.add_argument('--rank', type=int, default=0)
    parser.add_argument('--world_size', type=int, default=1)
    parser.add_argument('--max_workers', type=int, default=None)
    opt = parser.parse_args(sys.argv[2:])
    opt = edict(vars(opt))

    os.makedirs(os.path.join(opt.output_dir, 'voxels'), exist_ok=True)

    # get file list
    if not os.path.exists(os.path.join(opt.output_dir, 'metadata.csv')):
        raise ValueError('metadata.csv not found')
    metadata = pd.read_csv(os.path.join(opt.output_dir, 'metadata.csv'))
    if opt.instances is None:
        if opt.filter_low_aesthetic_score is not None:
            metadata = metadata[metadata['aesthetic_score'] >= opt.filter_low_aesthetic_score]
        if 'rendered' not in metadata.columns:
            raise ValueError('metadata.csv does not have "rendered" column, please run "build_metadata.py" first')
        metadata = metadata[metadata['rendered'] == True]
        if 'voxelized' in metadata.columns:
            metadata = metadata[metadata['voxelized'] == False]
    else:
        # --instances is either a file with one sha256 per line or a
        # comma-separated list of sha256 values.
        if os.path.exists(opt.instances):
            with open(opt.instances, 'r') as f:
                instances = f.read().splitlines()
        else:
            instances = opt.instances.split(',')
        metadata = metadata[metadata['sha256'].isin(instances)]

    # Keep only the contiguous slice of work assigned to this rank.
    start = len(metadata) * opt.rank // opt.world_size
    end = len(metadata) * (opt.rank + 1) // opt.world_size
    metadata = metadata[start:end]
    records = []

    # filter out objects that are already processed
    # Collect the finished hashes first and drop them with a single mask
    # instead of re-filtering the whole DataFrame once per finished object.
    already_done = set()
    for sha256 in metadata['sha256'].values:
        ply_path = os.path.join(opt.output_dir, 'voxels', f'{sha256}.ply')
        if os.path.exists(ply_path):
            pts = utils3d.io.read_ply(ply_path)[0]
            records.append({'sha256': sha256, 'voxelized': True, 'num_voxels': len(pts)})
            already_done.add(sha256)
    if already_done:
        metadata = metadata[~metadata['sha256'].isin(already_done)]

    print(f'Processing {len(metadata)} objects...')

    # process objects
    func = partial(_voxelize, output_dir=opt.output_dir)
    voxelized = dataset_utils.foreach_instance(metadata, opt.output_dir, func, max_workers=opt.max_workers, desc='Voxelizing')
    # Merge freshly voxelized objects with the ones found on disk and write
    # one result file per rank.
    voxelized = pd.concat([voxelized, pd.DataFrame.from_records(records)])
    voxelized.to_csv(os.path.join(opt.output_dir, f'voxelized_{opt.rank}.csv'), index=False)
|
||||
Reference in New Issue
Block a user