import torch
import torch.nn as nn
import torch.optim
import torch.distributed
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing
import numpy as np
import os
from collections import OrderedDict
from ddp_model import NerfNet
import time
from data_loader_split import load_data_split
from utils import mse2psnr, img_HWC2CHW, colorize, colorize_np, TINY_NUMBER, to8b
import imageio
from ddp_run_nerf import config_parser
import logging
logger = logging.getLogger(__package__)

def setup_logger():
    # create logger
    logger = logging.getLogger(__package__)
    logger.setLevel(logging.DEBUG)

    # create console handler and set level to info
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)

    # create formatter
    formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(name)s: %(message)s')

    # add formatter to ch
    ch.setFormatter(formatter)

    # add ch to logger
    logger.addHandler(ch)

def intersect_sphere(ray_o, ray_d):
    '''
    ray_o, ray_d: [..., 3]
    compute the depth of the intersection point between this ray and the unit sphere
    '''
    # note: d1 becomes negative if the mid point is behind the camera
    d1 = -torch.sum(ray_d * ray_o, dim=-1) / torch.sum(ray_d * ray_d, dim=-1)
    p = ray_o + d1.unsqueeze(-1) * ray_d
    # consider the case where the ray does not intersect the sphere
    ray_d_cos = 1. / torch.norm(ray_d, dim=-1)
    d2 = torch.sqrt(1. - torch.sum(p * p, dim=-1)) * ray_d_cos

    return d1 + d2
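# Illustrative sanity check (not part of the original script): for a ray that
# starts at the origin and points along +z, d1 = 0, the closest point to the
# sphere center is the origin itself, and the ray exits the unit sphere at
# depth 1.
#
#   >>> ray_o = torch.zeros(1, 3)
#   >>> ray_d = torch.tensor([[0., 0., 1.]])
#   >>> intersect_sphere(ray_o, ray_d)
#   tensor([1.])
#
# The sqrt produces nan if the ray origin lies outside the unit sphere and the
# ray misses it; the surrounding code assumes cameras are normalized to lie
# inside the sphere.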

def perturb_samples(z_vals):
    # get intervals between samples
    mids = .5 * (z_vals[..., 1:] + z_vals[..., :-1])
    upper = torch.cat([mids, z_vals[..., -1:]], dim=-1)
    lower = torch.cat([z_vals[..., 0:1], mids], dim=-1)
    # uniform samples in those intervals
    t_rand = torch.rand_like(z_vals)
    z_vals = lower + (upper - lower) * t_rand    # [N_rays, N_samples]

    return z_vals
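# Illustrative note (not part of the original script): each depth is re-drawn
# uniformly inside its own bin, and neighboring bins only share an endpoint,
# so the stratified jitter never reorders samples along a ray:
#
#   >>> z_vals = torch.linspace(0., 1., 5).unsqueeze(0)   # [1, 5]
#   >>> perturbed = perturb_samples(z_vals)
#   >>> bool(torch.all(perturbed[..., 1:] >= perturbed[..., :-1]))
#   True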

def sample_pdf(bins, weights, N_samples, det=False):
    '''
    :param bins: tensor of shape [..., M+1], M is the number of bins
    :param weights: tensor of shape [..., M]
    :param N_samples: number of samples along each ray
    :param det: if True, will perform deterministic sampling
    :return: [..., N_samples]
    '''
    # Get pdf
    weights = weights + TINY_NUMBER                                      # prevent nans
    pdf = weights / torch.sum(weights, dim=-1, keepdim=True)             # [..., M]
    cdf = torch.cumsum(pdf, dim=-1)                                      # [..., M]
    cdf = torch.cat([torch.zeros_like(cdf[..., 0:1]), cdf], dim=-1)      # [..., M+1]

    # Take uniform samples
    dots_sh = list(weights.shape[:-1])
    M = weights.shape[-1]

    min_cdf = 0.00
    max_cdf = 1.00       # prevent outlier samples
    if det:
        u = torch.linspace(min_cdf, max_cdf, N_samples, device=bins.device)
        u = u.view([1]*len(dots_sh) + [N_samples]).expand(dots_sh + [N_samples,])          # [..., N_samples]
    else:
        sh = dots_sh + [N_samples]
        u = torch.rand(*sh, device=bins.device) * (max_cdf - min_cdf) + min_cdf             # [..., N_samples]

    # Invert CDF
    # [..., N_samples, 1] >= [..., 1, M] ----> [..., N_samples, M] ----> [..., N_samples,]
    above_inds = torch.sum(u.unsqueeze(-1) >= cdf[..., :M].unsqueeze(-2), dim=-1).long()

    # random sample inside each bin
    below_inds = torch.clamp(above_inds-1, min=0)
    inds_g = torch.stack((below_inds, above_inds), dim=-1)               # [..., N_samples, 2]

    cdf = cdf.unsqueeze(-2).expand(dots_sh + [N_samples, M+1])           # [..., N_samples, M+1]
    cdf_g = torch.gather(input=cdf, dim=-1, index=inds_g)                # [..., N_samples, 2]

    bins = bins.unsqueeze(-2).expand(dots_sh + [N_samples, M+1])         # [..., N_samples, M+1]
    bins_g = torch.gather(input=bins, dim=-1, index=inds_g)              # [..., N_samples, 2]

    # fix numeric issue
    denom = cdf_g[..., 1] - cdf_g[..., 0]                                # [..., N_samples]
    denom = torch.where(denom < TINY_NUMBER, torch.ones_like(denom), denom)
    t = (u - cdf_g[..., 0]) / denom

    samples = bins_g[..., 0] + t * (bins_g[..., 1] - bins_g[..., 0] + TINY_NUMBER)

    return samples
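# Illustrative example (not part of the original script): sample_pdf inverts the
# piecewise-linear CDF built from `weights`, so the returned depths concentrate
# where the weights are large. A quick way to see this:
#
#   >>> bins = torch.linspace(0., 1., 5).unsqueeze(0)        # [1, 5] -> 4 bins
#   >>> weights = torch.tensor([[0.1, 5.0, 0.1, 0.1]])       # most mass in the 2nd bin
#   >>> sample_pdf(bins, weights, N_samples=8, det=True)
#
# Most of the 8 samples fall inside [0.25, 0.5]. render_single_image calls this
# with det=True to refine foreground/background depths around the weights
# predicted by the previous cascade level.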

def render_single_image(rank, world_size, models, ray_sampler, chunk_size):
    ##### parallel rendering of a single image
    ray_batch = ray_sampler.get_all()

    # split into ranks; make sure different processes don't overlap
    rank_split_sizes = [ray_batch['ray_d'].shape[0] // world_size, ] * world_size
    rank_split_sizes[-1] = ray_batch['ray_d'].shape[0] - sum(rank_split_sizes[:-1])
    for key in ray_batch:
        if torch.is_tensor(ray_batch[key]):
            ray_batch[key] = torch.split(ray_batch[key], rank_split_sizes)[rank].to(rank)

    # split into chunks and render inside each process
    ray_batch_split = OrderedDict()
    for key in ray_batch:
        if torch.is_tensor(ray_batch[key]):
            ray_batch_split[key] = torch.split(ray_batch[key], chunk_size)

    # forward and backward
    ret_merge_chunk = [OrderedDict() for _ in range(models['cascade_level'])]
    for s in range(len(ray_batch_split['ray_d'])):
        ray_o = ray_batch_split['ray_o'][s]
        ray_d = ray_batch_split['ray_d'][s]
        min_depth = ray_batch_split['min_depth'][s]

        dots_sh = list(ray_d.shape[:-1])
        for m in range(models['cascade_level']):
            net = models['net_{}'.format(m)]
            # sample depths
            N_samples = models['cascade_samples'][m]
            if m == 0:
                # foreground depth
                fg_far_depth = intersect_sphere(ray_o, ray_d)  # [...,]
                # fg_near_depth = 0.18 * torch.ones_like(fg_far_depth)
                fg_near_depth = min_depth  # [...,]
                step = (fg_far_depth - fg_near_depth) / (N_samples - 1)
                fg_depth = torch.stack([fg_near_depth + i * step for i in range(N_samples)], dim=-1)  # [..., N_samples]

                # background depth
                bg_depth = torch.linspace(0., 1., N_samples).view(
                    [1, ] * len(dots_sh) + [N_samples,]).expand(dots_sh + [N_samples,]).to(rank)

                # delete unused memory
                del fg_near_depth
                del step
                torch.cuda.empty_cache()
            else:
                # sample pdf and concat with earlier samples
                fg_weights = ret['fg_weights'].clone().detach()
                fg_depth_mid = .5 * (fg_depth[..., 1:] + fg_depth[..., :-1])    # [..., N_samples-1]
                fg_weights = fg_weights[..., 1:-1]                              # [..., N_samples-2]
                fg_depth_samples = sample_pdf(bins=fg_depth_mid, weights=fg_weights,
                                              N_samples=N_samples, det=True)    # [..., N_samples]
                fg_depth, _ = torch.sort(torch.cat((fg_depth, fg_depth_samples), dim=-1))

                # sample pdf and concat with earlier samples
                bg_weights = ret['bg_weights'].clone().detach()
                bg_depth_mid = .5 * (bg_depth[..., 1:] + bg_depth[..., :-1])
                bg_weights = bg_weights[..., 1:-1]                              # [..., N_samples-2]
                bg_depth_samples = sample_pdf(bins=bg_depth_mid, weights=bg_weights,
                                              N_samples=N_samples, det=True)    # [..., N_samples]
                bg_depth, _ = torch.sort(torch.cat((bg_depth, bg_depth_samples), dim=-1))

                # delete unused memory
                del fg_weights
                del fg_depth_mid
                del fg_depth_samples
                del bg_weights
                del bg_depth_mid
                del bg_depth_samples
                torch.cuda.empty_cache()

            with torch.no_grad():
                ret = net(ray_o, ray_d, fg_far_depth, fg_depth, bg_depth)

            for key in ret:
                if key not in ['fg_weights', 'bg_weights']:
                    if torch.is_tensor(ret[key]):
                        if key not in ret_merge_chunk[m]:
                            ret_merge_chunk[m][key] = [ret[key].cpu(), ]
                        else:
                            ret_merge_chunk[m][key].append(ret[key].cpu())

                        ret[key] = None

            # clean unused memory
            torch.cuda.empty_cache()

    # merge results from different chunks
    for m in range(len(ret_merge_chunk)):
        for key in ret_merge_chunk[m]:
            ret_merge_chunk[m][key] = torch.cat(ret_merge_chunk[m][key], dim=0)

    # merge results from different processes
    if rank == 0:
        ret_merge_rank = [OrderedDict() for _ in range(len(ret_merge_chunk))]
        for m in range(len(ret_merge_chunk)):
            for key in ret_merge_chunk[m]:
                # generate tensors to store results from other processes
                sh = list(ret_merge_chunk[m][key].shape[1:])
                ret_merge_rank[m][key] = [torch.zeros(*[size,]+sh, dtype=torch.float32) for size in rank_split_sizes]
                torch.distributed.gather(ret_merge_chunk[m][key], ret_merge_rank[m][key])
                ret_merge_rank[m][key] = torch.cat(ret_merge_rank[m][key], dim=0).reshape(
                    (ray_sampler.H, ray_sampler.W, -1)).squeeze()
                # print(m, key, ret_merge_rank[m][key].shape)
    else:  # send results to main process
        for m in range(len(ret_merge_chunk)):
            for key in ret_merge_chunk[m]:
                torch.distributed.gather(ret_merge_chunk[m][key])

    # only rank 0 returns the merged results
    if rank == 0:
        return ret_merge_rank
    else:
        return None
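# Worked example of the ray split above (illustrative, with made-up numbers):
# for an image with H*W = 10 rays and world_size = 3,
#
#   rank_split_sizes = [10 // 3] * 3        ->  [3, 3, 3]
#   rank_split_sizes[-1] = 10 - (3 + 3)     ->  [3, 3, 4]
#
# so ranks 0 and 1 each render 3 rays and rank 2 renders the remaining 4. Each
# rank further splits its rays into chunks of `chunk_size`, and rank 0 gathers
# the per-rank tensors, concatenates them in rank order, and reshapes to
# (H, W, -1) to reassemble the image.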

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # initialize the process group
    torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size)


def cleanup():
    torch.distributed.destroy_process_group()
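# Note (not part of the original script): setup() hard-codes a single-node
# rendezvous at localhost:12355 and uses the "gloo" backend for the process
# group; the per-process CUDA device is selected later via
# torch.cuda.set_device(rank) in ddp_test_nerf. If port 12355 is already in use
# on the machine, changing the MASTER_PORT value here is a simple workaround.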

def ddp_test_nerf(rank, args):
    ###### set up multi-processing
    setup(rank, args.world_size)
    ###### set up logger
    logger = logging.getLogger(__package__)
    setup_logger()

    ###### decide chunk size according to gpu memory
    if torch.cuda.get_device_properties(rank).total_memory / 1e9 > 14:
        logger.info('setting batch size according to 24G gpu')
        args.N_rand = 1024
        args.chunk_size = 8192
    else:
        logger.info('setting batch size according to 12G gpu')
        args.N_rand = 512
        args.chunk_size = 4096

    ###### create network and wrap in ddp; each process should do this
    # fix the random seed to make sure the network is initialized with the same weights in every process
    torch.manual_seed(777)
    # very important! otherwise it might allocate extra memory on the rank 0 gpu
    torch.cuda.set_device(rank)

    models = OrderedDict()
    models['cascade_level'] = args.cascade_level
    models['cascade_samples'] = [int(x.strip()) for x in args.cascade_samples.split(',')]
    for m in range(models['cascade_level']):
        net = NerfNet(args).to(rank)
        net = DDP(net, device_ids=[rank], output_device=rank)
        optim = torch.optim.Adam(net.parameters(), lr=args.lrate)
        models['net_{}'.format(m)] = net
        models['optim_{}'.format(m)] = optim

    start = -1

    ###### load pretrained weights; each process should do this
    if (args.ckpt_path is not None) and (os.path.isfile(args.ckpt_path)):
        ckpts = [args.ckpt_path]
    else:
        ckpts = [os.path.join(args.basedir, args.expname, f)
                 for f in sorted(os.listdir(os.path.join(args.basedir, args.expname))) if f.endswith('.pth')]

    def path2iter(path):
        tmp = os.path.basename(path)[:-4]
        idx = tmp.rfind('_')
        return int(tmp[idx + 1:])

    ckpts = sorted(ckpts, key=path2iter)
    logger.info('Found ckpts: {}'.format(ckpts))
    if len(ckpts) > 0 and not args.no_reload:
        fpath = ckpts[-1]
        logger.info('Reloading from: {}'.format(fpath))
        start = path2iter(fpath)
        # configure map_location properly for different processes
        map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
        to_load = torch.load(fpath, map_location=map_location)
        for m in range(models['cascade_level']):
            for name in ['net_{}'.format(m), 'optim_{}'.format(m)]:
                models[name].load_state_dict(to_load[name])

    render_splits = [x.strip() for x in args.render_splits.strip().split(',')]
    # start testing
    for split in render_splits:
        out_dir = os.path.join(args.basedir, args.expname,
                               'render_{}_{:06d}'.format(split, start))
        if rank == 0:
            os.makedirs(out_dir, exist_ok=True)

        ###### load data and create ray samplers; each process should do this
        ray_samplers = load_data_split(args.datadir, args.scene, split, try_load_min_depth=args.load_min_depth)

        for idx in range(len(ray_samplers)):
            ### each process should do this; but only the main process merges the results
            fname = '{:06d}.png'.format(idx)
            if ray_samplers[idx].img_path is not None:
                fname = os.path.basename(ray_samplers[idx].img_path)

            if os.path.isfile(os.path.join(out_dir, fname)):
                logger.info('Skipping {}'.format(fname))
                continue

            time0 = time.time()
            ret = render_single_image(rank, args.world_size, models, ray_samplers[idx], args.chunk_size)
            dt = time.time() - time0
            if rank == 0:    # only the main process should do this
                logger.info('Rendered {} in {} seconds'.format(fname, dt))

                # only save the last cascade level
                im = ret[-1]['rgb'].numpy()
                # compute psnr if ground-truth is available
                if ray_samplers[idx].img_path is not None:
                    gt_im = ray_samplers[idx].get_img()
                    psnr = mse2psnr(np.mean((gt_im - im) * (gt_im - im)))
                    logger.info('{}: psnr={}'.format(fname, psnr))

                im = to8b(im)
                imageio.imwrite(os.path.join(out_dir, fname), im)

                # im = ret[-1]['diffuse_rgb'].numpy()
                # im = to8b(im)
                # imageio.imwrite(os.path.join(out_dir, 'diffuse_' + fname), im)

                im = ret[-1]['fg_rgb'].numpy()
                im = to8b(im)
                imageio.imwrite(os.path.join(out_dir, 'fg_' + fname), im)

                im = ret[-1]['bg_rgb'].numpy()
                im = to8b(im)
                imageio.imwrite(os.path.join(out_dir, 'bg_' + fname), im)

                im = ret[-1]['fg_depth'].numpy()
                im = colorize_np(im, cmap_name='jet', append_cbar=True)
                im = to8b(im)
                imageio.imwrite(os.path.join(out_dir, 'fg_depth_' + fname), im)

                im = ret[-1]['bg_depth'].numpy()
                im = colorize_np(im, cmap_name='jet', append_cbar=True)
                im = to8b(im)
                imageio.imwrite(os.path.join(out_dir, 'bg_depth_' + fname), im)

            torch.cuda.empty_cache()

    # clean up for multi-processing
    cleanup()
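# Illustrative note (not part of the original script): checkpoints are expected
# to end in '.pth' with the iteration number as the last underscore-separated
# token, which is exactly what the nested path2iter helper parses, e.g.
#
#   >>> path2iter('/some/dir/model_250000.pth')
#   250000
#
# Unless args.no_reload is set, the newest checkpoint is reloaded and the output
# folder is named render_{split}_{start:06d}.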

def test():
    parser = config_parser()
    args = parser.parse_args()
    logger.info(parser.format_values())

    if args.world_size == -1:
        args.world_size = torch.cuda.device_count()
        logger.info('Using # gpus: {}'.format(args.world_size))

    torch.multiprocessing.spawn(ddp_test_nerf,
                                args=(args,),
                                nprocs=args.world_size,
                                join=True)
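# Example invocation (a sketch only; the flag spellings are assumed from the
# corresponding args attributes read above, and the config path is a
# placeholder):
#
#   python ddp_test_nerf.py --config configs/<scene>.txt --render_splits test
#
# config_parser is shared with ddp_run_nerf, so the same config file used for
# training should work here. When world_size is -1 it falls back to the number
# of visible GPUs, and torch.multiprocessing.spawn launches one process per GPU.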

if __name__ == '__main__':
    setup_logger()
    test()