import torch
import torch.nn as nn
import torch.optim
import torch.distributed
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing
import numpy as np
import os
from collections import OrderedDict
from ddp_model import NerfNet
import time
from data_loader_split import load_data_split
from utils import mse2psnr, img_HWC2CHW, colorize, colorize_np, TINY_NUMBER, to8b
import imageio
from ddp_run_nerf import config_parser
import logging
logger = logging.getLogger(__package__)

def setup_logger():
    # create logger
    logger = logging.getLogger(__package__)
    logger.setLevel(logging.DEBUG)

    # create console handler and set level to info
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)

    # create formatter
    formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(name)s: %(message)s')

    # add formatter to ch
    ch.setFormatter(formatter)

    # add ch to logger
    logger.addHandler(ch)

def intersect_sphere(ray_o, ray_d):
    '''
    ray_o, ray_d: [..., 3]
    compute the depth of the intersection point between this ray and the unit sphere
    '''
    # note: d1 becomes negative if the mid point is behind the camera
    d1 = -torch.sum(ray_d * ray_o, dim=-1) / torch.sum(ray_d * ray_d, dim=-1)
    p = ray_o + d1.unsqueeze(-1) * ray_d
    # consider the case where the ray does not intersect the sphere
    ray_d_cos = 1. / torch.norm(ray_d, dim=-1)
    d2 = torch.sqrt(1. - torch.sum(p * p, dim=-1)) * ray_d_cos

    return d1 + d2
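# Illustrative sanity check (not part of the original script): for a ray that
# starts at the origin and points along +z, d1 = 0, the closest point to the
# sphere center is the origin itself, and the ray exits the unit sphere at
# depth 1.
#
#   >>> ray_o = torch.zeros(1, 3)
#   >>> ray_d = torch.tensor([[0., 0., 1.]])
#   >>> intersect_sphere(ray_o, ray_d)
#   tensor([1.])
#
# The sqrt produces nan if the ray origin lies outside the unit sphere and the
# ray misses it; the surrounding code assumes cameras are normalized to lie
# inside the sphere.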

def perturb_samples(z_vals):
    # get intervals between samples
    mids = .5 * (z_vals[..., 1:] + z_vals[..., :-1])
    upper = torch.cat([mids, z_vals[..., -1:]], dim=-1)
    lower = torch.cat([z_vals[..., 0:1], mids], dim=-1)
    # uniform samples in those intervals
    t_rand = torch.rand_like(z_vals)
    z_vals = lower + (upper - lower) * t_rand    # [N_rays, N_samples]

    return z_vals
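# Illustrative note (not part of the original script): each depth is re-drawn
# uniformly inside its own bin, and neighboring bins only share an endpoint,
# so the stratified jitter never reorders samples along a ray:
#
#   >>> z_vals = torch.linspace(0., 1., 5).unsqueeze(0)   # [1, 5]
#   >>> perturbed = perturb_samples(z_vals)
#   >>> bool(torch.all(perturbed[..., 1:] >= perturbed[..., :-1]))
#   True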

def sample_pdf(bins, weights, N_samples, det=False):
    '''
    :param bins: tensor of shape [..., M+1], M is the number of bins
    :param weights: tensor of shape [..., M]
    :param N_samples: number of samples along each ray
    :param det: if True, will perform deterministic sampling
    :return: [..., N_samples]
    '''
    # Get pdf
    weights = weights + TINY_NUMBER                                      # prevent nans
    pdf = weights / torch.sum(weights, dim=-1, keepdim=True)             # [..., M]
    cdf = torch.cumsum(pdf, dim=-1)                                      # [..., M]
    cdf = torch.cat([torch.zeros_like(cdf[..., 0:1]), cdf], dim=-1)      # [..., M+1]

    # Take uniform samples
    dots_sh = list(weights.shape[:-1])
    M = weights.shape[-1]

    min_cdf = 0.00
    max_cdf = 1.00       # prevent outlier samples
    if det:
        u = torch.linspace(min_cdf, max_cdf, N_samples, device=bins.device)
        u = u.view([1]*len(dots_sh) + [N_samples]).expand(dots_sh + [N_samples,])          # [..., N_samples]
    else:
        sh = dots_sh + [N_samples]
        u = torch.rand(*sh, device=bins.device) * (max_cdf - min_cdf) + min_cdf             # [..., N_samples]

    # Invert CDF
    # [..., N_samples, 1] >= [..., 1, M] ----> [..., N_samples, M] ----> [..., N_samples,]
    above_inds = torch.sum(u.unsqueeze(-1) >= cdf[..., :M].unsqueeze(-2), dim=-1).long()

    # random sample inside each bin
    below_inds = torch.clamp(above_inds-1, min=0)
    inds_g = torch.stack((below_inds, above_inds), dim=-1)               # [..., N_samples, 2]

    cdf = cdf.unsqueeze(-2).expand(dots_sh + [N_samples, M+1])           # [..., N_samples, M+1]
    cdf_g = torch.gather(input=cdf, dim=-1, index=inds_g)                # [..., N_samples, 2]

    bins = bins.unsqueeze(-2).expand(dots_sh + [N_samples, M+1])         # [..., N_samples, M+1]
    bins_g = torch.gather(input=bins, dim=-1, index=inds_g)              # [..., N_samples, 2]

    # fix numeric issue
    denom = cdf_g[..., 1] - cdf_g[..., 0]                                # [..., N_samples]
    denom = torch.where(denom < TINY_NUMBER, torch.ones_like(denom), denom)
    t = (u - cdf_g[..., 0]) / denom

    samples = bins_g[..., 0] + t * (bins_g[..., 1] - bins_g[..., 0] + TINY_NUMBER)

    return samples
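# Illustrative example (not part of the original script): sample_pdf inverts the
# piecewise-linear CDF built from `weights`, so the returned depths concentrate
# where the weights are large. A quick way to see this:
#
#   >>> bins = torch.linspace(0., 1., 5).unsqueeze(0)        # [1, 5] -> 4 bins
#   >>> weights = torch.tensor([[0.1, 5.0, 0.1, 0.1]])       # most mass in the 2nd bin
#   >>> sample_pdf(bins, weights, N_samples=8, det=True)
#
# Most of the 8 samples fall inside [0.25, 0.5]. render_single_image calls this
# with det=True to refine foreground/background depths around the weights
# predicted by the previous cascade level.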

def render_single_image(rank, world_size, models, ray_sampler, chunk_size):
    ##### parallel rendering of a single image
    ray_batch = ray_sampler.get_all()

    # split into ranks; make sure different processes don't overlap
    rank_split_sizes = [ray_batch['ray_d'].shape[0] // world_size, ] * world_size
    rank_split_sizes[-1] = ray_batch['ray_d'].shape[0] - sum(rank_split_sizes[:-1])
    for key in ray_batch:
        if torch.is_tensor(ray_batch[key]):
            ray_batch[key] = torch.split(ray_batch[key], rank_split_sizes)[rank].to(rank)

    # split into chunks and render inside each process
    ray_batch_split = OrderedDict()
    for key in ray_batch:
        if torch.is_tensor(ray_batch[key]):
            ray_batch_split[key] = torch.split(ray_batch[key], chunk_size)

    # forward and backward
    ret_merge_chunk = [OrderedDict() for _ in range(models['cascade_level'])]
    for s in range(len(ray_batch_split['ray_d'])):
        ray_o = ray_batch_split['ray_o'][s]
        ray_d = ray_batch_split['ray_d'][s]
        min_depth = ray_batch_split['min_depth'][s]

        dots_sh = list(ray_d.shape[:-1])
        for m in range(models['cascade_level']):
            net = models['net_{}'.format(m)]
            # sample depths
            N_samples = models['cascade_samples'][m]
            if m == 0:
                # foreground depth
                fg_far_depth = intersect_sphere(ray_o, ray_d)  # [...,]
                # fg_near_depth = 0.18 * torch.ones_like(fg_far_depth)
                fg_near_depth = min_depth  # [...,]
                step = (fg_far_depth - fg_near_depth) / (N_samples - 1)
                fg_depth = torch.stack([fg_near_depth + i * step for i in range(N_samples)], dim=-1)  # [..., N_samples]

                # background depth
                bg_depth = torch.linspace(0., 1., N_samples).view(
                    [1, ] * len(dots_sh) + [N_samples,]).expand(dots_sh + [N_samples,]).to(rank)

                # delete unused memory
                del fg_near_depth
                del step
                torch.cuda.empty_cache()
            else:
                # sample pdf and concat with earlier samples
                fg_weights = ret['fg_weights'].clone().detach()
                fg_depth_mid = .5 * (fg_depth[..., 1:] + fg_depth[..., :-1])    # [..., N_samples-1]
                fg_weights = fg_weights[..., 1:-1]                              # [..., N_samples-2]
                fg_depth_samples = sample_pdf(bins=fg_depth_mid, weights=fg_weights,
                                              N_samples=N_samples, det=True)    # [..., N_samples]
                fg_depth, _ = torch.sort(torch.cat((fg_depth, fg_depth_samples), dim=-1))

                # sample pdf and concat with earlier samples
                bg_weights = ret['bg_weights'].clone().detach()
                bg_depth_mid = .5 * (bg_depth[..., 1:] + bg_depth[..., :-1])
                bg_weights = bg_weights[..., 1:-1]                              # [..., N_samples-2]
                bg_depth_samples = sample_pdf(bins=bg_depth_mid, weights=bg_weights,
                                              N_samples=N_samples, det=True)    # [..., N_samples]
                bg_depth, _ = torch.sort(torch.cat((bg_depth, bg_depth_samples), dim=-1))

                # delete unused memory
                del fg_weights
                del fg_depth_mid
                del fg_depth_samples
                del bg_weights
                del bg_depth_mid
                del bg_depth_samples
                torch.cuda.empty_cache()

            with torch.no_grad():
                ret = net(ray_o, ray_d, fg_far_depth, fg_depth, bg_depth)

            for key in ret:
                if key not in ['fg_weights', 'bg_weights']:
                    if torch.is_tensor(ret[key]):
                        if key not in ret_merge_chunk[m]:
                            ret_merge_chunk[m][key] = [ret[key].cpu(), ]
                        else:
                            ret_merge_chunk[m][key].append(ret[key].cpu())

                        ret[key] = None

            # clean unused memory
            torch.cuda.empty_cache()

    # merge results from different chunks
    for m in range(len(ret_merge_chunk)):
        for key in ret_merge_chunk[m]:
            ret_merge_chunk[m][key] = torch.cat(ret_merge_chunk[m][key], dim=0)

    # merge results from different processes
    if rank == 0:
        ret_merge_rank = [OrderedDict() for _ in range(len(ret_merge_chunk))]
        for m in range(len(ret_merge_chunk)):
            for key in ret_merge_chunk[m]:
                # generate tensors to store results from other processes
                sh = list(ret_merge_chunk[m][key].shape[1:])
                ret_merge_rank[m][key] = [torch.zeros(*[size,]+sh, dtype=torch.float32) for size in rank_split_sizes]
                torch.distributed.gather(ret_merge_chunk[m][key], ret_merge_rank[m][key])
                ret_merge_rank[m][key] = torch.cat(ret_merge_rank[m][key], dim=0).reshape(
                    (ray_sampler.H, ray_sampler.W, -1)).squeeze()
                # print(m, key, ret_merge_rank[m][key].shape)
    else:  # send results to main process
        for m in range(len(ret_merge_chunk)):
            for key in ret_merge_chunk[m]:
                torch.distributed.gather(ret_merge_chunk[m][key])

    # only rank 0 returns the merged results
    if rank == 0:
        return ret_merge_rank
    else:
        return None
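# Worked example of the ray split above (illustrative, with made-up numbers):
# for an image with H*W = 10 rays and world_size = 3,
#
#   rank_split_sizes = [10 // 3] * 3        ->  [3, 3, 3]
#   rank_split_sizes[-1] = 10 - (3 + 3)     ->  [3, 3, 4]
#
# so ranks 0 and 1 each render 3 rays and rank 2 renders the remaining 4. Each
# rank further splits its rays into chunks of `chunk_size`, and rank 0 gathers
# the per-rank tensors, concatenates them in rank order, and reshapes to
# (H, W, -1) to reassemble the image.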

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # initialize the process group
    torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size)


def cleanup():
    torch.distributed.destroy_process_group()
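# Note (not part of the original script): setup() hard-codes a single-node
# rendezvous at localhost:12355 and uses the "gloo" backend for the process
# group; the per-process CUDA device is selected later via
# torch.cuda.set_device(rank) in ddp_test_nerf. If port 12355 is already in use
# on the machine, changing the MASTER_PORT value here is a simple workaround.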

def ddp_test_nerf(rank, args):
    ###### set up multi-processing
    setup(rank, args.world_size)
    ###### set up logger
    logger = logging.getLogger(__package__)
    setup_logger()

    ###### decide chunk size according to gpu memory
    if torch.cuda.get_device_properties(rank).total_memory / 1e9 > 14:
        logger.info('setting batch size according to 24G gpu')
        args.N_rand = 1024
        args.chunk_size = 8192
    else:
        logger.info('setting batch size according to 12G gpu')
        args.N_rand = 512
        args.chunk_size = 4096

    ###### create network and wrap in ddp; each process should do this
    # fix the random seed to make sure the network is initialized with the same weights in every process
    torch.manual_seed(777)
    # very important! otherwise it might allocate extra memory on the rank 0 gpu
    torch.cuda.set_device(rank)

    models = OrderedDict()
    models['cascade_level'] = args.cascade_level
    models['cascade_samples'] = [int(x.strip()) for x in args.cascade_samples.split(',')]
    for m in range(models['cascade_level']):
        net = NerfNet(args).to(rank)
        net = DDP(net, device_ids=[rank], output_device=rank)
        optim = torch.optim.Adam(net.parameters(), lr=args.lrate)
        models['net_{}'.format(m)] = net
        models['optim_{}'.format(m)] = optim

    start = -1

    ###### load pretrained weights; each process should do this
    if (args.ckpt_path is not None) and (os.path.isfile(args.ckpt_path)):
        ckpts = [args.ckpt_path]
    else:
        ckpts = [os.path.join(args.basedir, args.expname, f)
                 for f in sorted(os.listdir(os.path.join(args.basedir, args.expname))) if f.endswith('.pth')]

    def path2iter(path):
        tmp = os.path.basename(path)[:-4]
        idx = tmp.rfind('_')
        return int(tmp[idx + 1:])

    ckpts = sorted(ckpts, key=path2iter)
    logger.info('Found ckpts: {}'.format(ckpts))
    if len(ckpts) > 0 and not args.no_reload:
        fpath = ckpts[-1]
        logger.info('Reloading from: {}'.format(fpath))
        start = path2iter(fpath)
        # configure map_location properly for different processes
        map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
        to_load = torch.load(fpath, map_location=map_location)
        for m in range(models['cascade_level']):
            for name in ['net_{}'.format(m), 'optim_{}'.format(m)]:
                models[name].load_state_dict(to_load[name])

    render_splits = [x.strip() for x in args.render_splits.strip().split(',')]
    # start testing
    for split in render_splits:
        out_dir = os.path.join(args.basedir, args.expname,
                               'render_{}_{:06d}'.format(split, start))
        if rank == 0:
            os.makedirs(out_dir, exist_ok=True)

        ###### load data and create ray samplers; each process should do this
        ray_samplers = load_data_split(args.datadir, args.scene, split, try_load_min_depth=args.load_min_depth)

        for idx in range(len(ray_samplers)):
            ### each process should do this; but only the main process merges the results
            fname = '{:06d}.png'.format(idx)
            if ray_samplers[idx].img_path is not None:
                fname = os.path.basename(ray_samplers[idx].img_path)

            if os.path.isfile(os.path.join(out_dir, fname)):
                logger.info('Skipping {}'.format(fname))
                continue

            time0 = time.time()
            ret = render_single_image(rank, args.world_size, models, ray_samplers[idx], args.chunk_size)
            dt = time.time() - time0
            if rank == 0:    # only the main process should do this
                logger.info('Rendered {} in {} seconds'.format(fname, dt))

                # only save the last cascade level
                im = ret[-1]['rgb'].numpy()
                # compute psnr if ground-truth is available
                if ray_samplers[idx].img_path is not None:
                    gt_im = ray_samplers[idx].get_img()
                    psnr = mse2psnr(np.mean((gt_im - im) * (gt_im - im)))
                    logger.info('{}: psnr={}'.format(fname, psnr))

                im = to8b(im)
                imageio.imwrite(os.path.join(out_dir, fname), im)

                # im = ret[-1]['diffuse_rgb'].numpy()
                # im = to8b(im)
                # imageio.imwrite(os.path.join(out_dir, 'diffuse_' + fname), im)

                im = ret[-1]['fg_rgb'].numpy()
                im = to8b(im)
                imageio.imwrite(os.path.join(out_dir, 'fg_' + fname), im)

                im = ret[-1]['bg_rgb'].numpy()
                im = to8b(im)
                imageio.imwrite(os.path.join(out_dir, 'bg_' + fname), im)

                im = ret[-1]['fg_depth'].numpy()
                im = colorize_np(im, cmap_name='jet', append_cbar=True)
                im = to8b(im)
                imageio.imwrite(os.path.join(out_dir, 'fg_depth_' + fname), im)

                im = ret[-1]['bg_depth'].numpy()
                im = colorize_np(im, cmap_name='jet', append_cbar=True)
                im = to8b(im)
                imageio.imwrite(os.path.join(out_dir, 'bg_depth_' + fname), im)

            torch.cuda.empty_cache()

    # clean up for multi-processing
    cleanup()
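# Illustrative note (not part of the original script): checkpoints are expected
# to end in '.pth' with the iteration number as the last underscore-separated
# token, which is exactly what the nested path2iter helper parses, e.g.
#
#   >>> path2iter('/some/dir/model_250000.pth')
#   250000
#
# Unless args.no_reload is set, the newest checkpoint is reloaded and the output
# folder is named render_{split}_{start:06d}.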

def test():
    parser = config_parser()
    args = parser.parse_args()
    logger.info(parser.format_values())

    if args.world_size == -1:
        args.world_size = torch.cuda.device_count()
        logger.info('Using # gpus: {}'.format(args.world_size))

    torch.multiprocessing.spawn(ddp_test_nerf,
                                args=(args,),
                                nprocs=args.world_size,
                                join=True)
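# Example invocation (a sketch only; the flag spellings are assumed from the
# corresponding args attributes read above, and the config path is a
# placeholder):
#
#   python ddp_test_nerf.py --config configs/<scene>.txt --render_splits test
#
# config_parser is shared with ddp_run_nerf, so the same config file used for
# training should work here. When world_size is -1 it falls back to the number
# of visible GPUs, and torch.multiprocessing.spawn launches one process per GPU.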

if __name__ == '__main__':
    setup_logger()
    test()