From 1904e48dba53402b54e148bf0bd517b72e9cabef Mon Sep 17 00:00:00 2001 From: Deyu Fu Date: Fri, 4 Oct 2019 11:21:36 -0700 Subject: [PATCH] move previous fused_adam and fp16_optimizer to contrib (#517) * move previous fused_adam and fp16_optimizer to contrib * make build contrib.fused_adam optional * change build option name * remove unnecessary try import --- apex/amp/_initialize.py | 2 +- .../csrc/optimizers}/fused_adam_cuda.cpp | 0 .../optimizers}/fused_adam_cuda_kernel.cu | 0 apex/contrib/optimizers/__init__.py | 2 + .../optimizers/fp16_optimizer.py | 0 apex/contrib/optimizers/fused_adam.py | 197 ++++++++++++++++++ apex/optimizers/__init__.py | 1 - setup.py | 26 ++- 8 files changed, 219 insertions(+), 9 deletions(-) rename {csrc => apex/contrib/csrc/optimizers}/fused_adam_cuda.cpp (100%) rename {csrc => apex/contrib/csrc/optimizers}/fused_adam_cuda_kernel.cu (100%) create mode 100644 apex/contrib/optimizers/__init__.py rename apex/{ => contrib}/optimizers/fp16_optimizer.py (100%) create mode 100644 apex/contrib/optimizers/fused_adam.py diff --git a/apex/amp/_initialize.py b/apex/amp/_initialize.py index a071aa818..19cde79bb 100644 --- a/apex/amp/_initialize.py +++ b/apex/amp/_initialize.py @@ -9,7 +9,7 @@ from ._process_optimizer import _process_optimizer from apex.fp16_utils import convert_network from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general -from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused +from ..contrib.optimizers import FP16_Optimizer as FP16_Optimizer_for_fused from ..parallel import DistributedDataParallel as apex_DDP from ..parallel.LARC import LARC diff --git a/csrc/fused_adam_cuda.cpp b/apex/contrib/csrc/optimizers/fused_adam_cuda.cpp similarity index 100% rename from csrc/fused_adam_cuda.cpp rename to apex/contrib/csrc/optimizers/fused_adam_cuda.cpp diff --git a/csrc/fused_adam_cuda_kernel.cu b/apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu similarity index 100% rename from csrc/fused_adam_cuda_kernel.cu rename to apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu diff --git a/apex/contrib/optimizers/__init__.py b/apex/contrib/optimizers/__init__.py new file mode 100644 index 000000000..fdac76ca8 --- /dev/null +++ b/apex/contrib/optimizers/__init__.py @@ -0,0 +1,2 @@ +from .fp16_optimizer import FP16_Optimizer +from .fused_adam import FusedAdam diff --git a/apex/optimizers/fp16_optimizer.py b/apex/contrib/optimizers/fp16_optimizer.py similarity index 100% rename from apex/optimizers/fp16_optimizer.py rename to apex/contrib/optimizers/fp16_optimizer.py diff --git a/apex/contrib/optimizers/fused_adam.py b/apex/contrib/optimizers/fused_adam.py new file mode 100644 index 000000000..d56ff5a16 --- /dev/null +++ b/apex/contrib/optimizers/fused_adam.py @@ -0,0 +1,197 @@ +import types +import torch +import importlib +from apex.multi_tensor_apply import multi_tensor_applier + +class FusedAdam(torch.optim.Optimizer): + + """Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via + ``python setup.py install --cuda_ext --cpp_ext``. + + It has been proposed in `Adam: A Method for Stochastic Optimization`_. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square. (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability. 
(default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) NOT SUPPORTED in FusedAdam! + eps_inside_sqrt (boolean, optional): in the 'update parameters' step, + adds eps to the bias-corrected second moment estimate before + evaluating square root instead of adding it to the square root of + second moment estimate as in the original paper. (default: False) + use_mt (boolean, optional): use multi tensor apply for lower launch + latency. (default: False) + + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, + lr=1e-3, bias_correction = True, + betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False, + weight_decay=0., max_grad_norm=0., amsgrad=False, use_mt=False, + amp_scale_adjustment=1.0): + global fused_adam_cuda + fused_adam_cuda = importlib.import_module("fused_adam_cuda") + + self._use_multi_tensor = False + if use_mt: + if not multi_tensor_applier.available: + print("Warning: multi_tensor_applier is unavailable") + else: + self._use_multi_tensor = True + self._overflow_buf = torch.cuda.IntTensor([0]) + + self._amp_scale_adjustment = amp_scale_adjustment + + if amsgrad: + raise RuntimeError('FusedAdam does not support the AMSGrad variant.') + defaults = dict(lr=lr, bias_correction=bias_correction, + betas=betas, eps=eps, weight_decay=weight_decay, + max_grad_norm=max_grad_norm) + super(FusedAdam, self).__init__(params, defaults) + self.eps_mode = 0 if eps_inside_sqrt else 1 + + def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + grads (list of tensors, optional): weight gradient to use for the + optimizer update. If gradients have type torch.half, parameters + are expected to be in type torch.float. (default: None) + output params (list of tensors, optional): A reduced precision copy + of the updated weights written out in addition to the regular + updated weights. Have to be of same type as gradients. (default: None) + scale (float, optional): factor to divide gradient tensor values + by before applying to weights. 
(default: 1) + """ + loss = None + if closure is not None: + loss = closure() + + if hasattr(self, "_amp_stash"): + grads = self._amp_stash.grads + output_params = self._amp_stash.output_params + scale = self._amp_stash.scale*self._amp_scale_adjustment + grad_norms = self._amp_stash.grad_norms + + if grads is None: + grads_group = [None]*len(self.param_groups) + # backward compatibility + # assuming a list/generator of parameter means single group + elif isinstance(grads, types.GeneratorType): + grads_group = [grads] + elif type(grads[0])!=list: + grads_group = [grads] + else: + grads_group = grads + + if output_params is None: + output_params_group = [None]*len(self.param_groups) + elif isinstance(output_params, types.GeneratorType): + output_params_group = [output_params] + elif type(output_params[0])!=list: + output_params_group = [output_params] + else: + output_params_group = output_params + + if grad_norms is None: + grad_norms = [None]*len(self.param_groups) + + for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group, output_params_group, grad_norms): + if grads_this_group is None: + grads_this_group = [None]*len(group['params']) + if output_params_this_group is None: + output_params_this_group = [None]*len(group['params']) + + # compute combined scale factor for this group + combined_scale = scale + if group['max_grad_norm'] > 0: + # norm is in fact norm*scale + clip = ((grad_norm / scale) + 1e-6) / group['max_grad_norm'] + if clip > 1: + combined_scale = clip * scale + + bias_correction = 1 if group['bias_correction'] else 0 + + if self._use_multi_tensor: + if output_params: + tensorlists = [[],[],[],[],[]] + else: + tensorlists = [[],[],[],[]] + + for p, grad, output_param in zip(group['params'], grads_this_group, output_params_this_group): + #note: p.grad should not ever be set for correct operation of mixed precision optimizer that sometimes sends None gradients + if p.grad is None and grad is None: + continue + if grad is None: + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + out_p = torch.tensor([], dtype = torch.float) if output_param is None else output_param + if self._use_multi_tensor: + pl = [p.data, exp_avg, exp_avg_sq, grad] + if output_param is not None: + pl.append(out_p) + + for tl, t in zip(tensorlists, pl): + tl.append(t) + else: + fused_adam_cuda.adam(p.data, + out_p, + exp_avg, + exp_avg_sq, + grad, + group['lr'], + beta1, + beta2, + group['eps'], + combined_scale, + state['step'], + self.eps_mode, + bias_correction, + group['weight_decay']) + + if self._use_multi_tensor: + multi_tensor_applier( + fused_adam_cuda.adam_mt, + self._overflow_buf, + tensorlists, + group['lr'], + beta1, + beta2, + group['eps'], + combined_scale, + state['step'], + self.eps_mode, + bias_correction, + group['weight_decay']) + + return loss diff --git a/apex/optimizers/__init__.py b/apex/optimizers/__init__.py index 96c435d52..420981118 100644 --- a/apex/optimizers/__init__.py +++ b/apex/optimizers/__init__.py @@ -2,4 +2,3 @@ from 
.fused_adam import FusedAdam from .fused_novograd import FusedNovoGrad from .fused_lamb import FusedLAMB -from .fp16_optimizer import FP16_Optimizer diff --git a/setup.py b/setup.py index 53b84d08e..f14de5223 100644 --- a/setup.py +++ b/setup.py @@ -116,13 +116,6 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir): '-O3', # '--resource-usage', '--use_fast_math'] + version_dependent_macros})) - ext_modules.append( - CUDAExtension(name='fused_adam_cuda', - sources=['csrc/fused_adam_cuda.cpp', - 'csrc/fused_adam_cuda_kernel.cu'], - extra_compile_args={'cxx': ['-O3',] + version_dependent_macros, - 'nvcc':['-O3', - '--use_fast_math'] + version_dependent_macros})) ext_modules.append( CUDAExtension(name='syncbn', sources=['csrc/syncbn.cpp', @@ -182,6 +175,25 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir): extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, 'nvcc':['-O3'] + version_dependent_macros})) +if "--deprecated_fused_adam" in sys.argv: + from torch.utils.cpp_extension import CUDAExtension + sys.argv.remove("--deprecated_fused_adam") + + from torch.utils.cpp_extension import BuildExtension + cmdclass['build_ext'] = BuildExtension + + if torch.utils.cpp_extension.CUDA_HOME is None: + raise RuntimeError("--deprecated_fused_adam was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") + else: + ext_modules.append( + CUDAExtension(name='fused_adam_cuda', + sources=['apex/contrib/csrc/optimizers/fused_adam_cuda.cpp', + 'apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu'], + include_dirs=['csrc'], + extra_compile_args={'cxx': ['-O3',] + version_dependent_macros, + 'nvcc':['-O3', + '--use_fast_math'] + version_dependent_macros})) + setup( name='apex', version='0.1',
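
Usage after this patch (illustrative sketch): the fused_adam_cuda extension is now built only when the optional --deprecated_fused_adam flag is passed to setup.py (for example `python setup.py install --cpp_ext --cuda_ext --deprecated_fused_adam`), and both FusedAdam and its companion FP16_Optimizer are imported from apex.contrib.optimizers instead of apex.optimizers. The snippet below is a minimal sketch under those assumptions; the model, data, and hyperparameters are placeholders rather than part of the patch, and the FP16_Optimizer calls assume the wrapper's existing backward()/step() loss-scaling API, which this patch relocates without modifying (100% rename similarity).

    import torch
    # New import path introduced by this patch (previously apex.optimizers):
    from apex.contrib.optimizers import FusedAdam, FP16_Optimizer

    # Placeholder FP16 model and optimizer settings -- not part of the patch.
    model = torch.nn.Linear(1024, 1024).cuda().half()
    optimizer = FusedAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999),
                          eps=1e-8, weight_decay=0.01)
    # Wrap with the relocated FP16_Optimizer for FP32 master weights / loss scaling.
    optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)

    for _ in range(3):
        inp = torch.randn(32, 1024, device='cuda', dtype=torch.float16)
        loss = model(inp).float().pow(2).mean()
        optimizer.zero_grad()
        optimizer.backward(loss)   # wrapper scales the loss before backprop
        optimizer.step()           # unscales grads and invokes the fused Adam kernel

For plain FP32 training, FusedAdam can also be used directly without the wrapper: its step() falls back to p.grad when no explicit gradient list is passed, as shown in the step() implementation above.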