@inproceedings{d4abd964a4e24090829b05b41fc8f7fb,
title = "Accumulated Gradient Normalization",
abstract = "This work addresses the instability in asynchronous data parallel optimization. It does so by introducing a novel distributed optimizer which is able to efficiently optimize a centralized model under communication constraints. The optimizer achieves this by pushing a normalized sequence of first-order gradients to a parameter server. This implies that the magnitude of a worker delta is smaller compared to an accumulated gradient, while providing a better direction towards a minimum than individual first-order gradients. In turn, this forces possible implicit momentum fluctuations to be more aligned, since we assume that all workers contribute towards a single minimum. As a result, our approach mitigates the parameter staleness problem more effectively, since staleness in asynchrony induces (implicit) momentum, and achieves a better convergence rate than other optimizers such as asynchronous EASGD, which we show empirically.",
author = "Hermans, {Joeri R.} and Gerasimos Spanakis and Rico M{\"o}ckel",
year = "2017",
month = nov,
language = "English",
volume = "77",
series = "Proceedings of Machine Learning Research",
publisher = "PMLR",
pages = "439--454",
editor = "Min-Ling Zhang and Yung-Kyun Noh",
booktitle = "Proceedings of the 9th Asian Conference on Machine Learning",
}