@inproceedings{33a79e9d322b406aba8a9db235625c7e,
  title     = {A Hierarchical, Bulk-Synchronous Stochastic Gradient Descent Algorithm for Deep-Learning Applications on {GPU} Clusters},
  abstract  = {The training data and models are becoming increasingly large in many deep-learning applications. Large-scale distributed processing is employed to accelerate training. Increasing the number of learners in synchronous and asynchronous stochastic gradient descent presents challenges to convergence and communication performance. We present our hierarchical, bulk-synchronous stochastic gradient algorithm that effectively balances execution time and accuracy for training in deep-learning applications on GPU clusters. It achieves much better convergence and execution time at scale in comparison to asynchronous stochastic gradient descent implementations. When deployed on a cluster of 128 GPUs, our implementation achieves up to 56 times speedups over the sequential stochastic gradient descent with similar test accuracy for our target application.},
  keywords  = {Deep learning, GPU, Stochastic gradient descent, distributed algorithm},
  author    = {Cong, Guojing and Bhardwaj, Onkar},
  note      = {Publisher Copyright: {\textcopyright} 2017 IEEE.; 16th IEEE International Conference on Machine Learning and Applications, ICMLA 2017 ; Conference date: 18-12-2017 Through 21-12-2017},
  year      = {2017},
  doi       = {10.1109/ICMLA.2017.00-56},
  language  = {English},
  series    = {Proceedings - 16th {IEEE} International Conference on Machine Learning and Applications, {ICMLA} 2017},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {818--821},
  editor    = {Chen, Xuewen and Luo, Bo and Luo, Feng and Palade, Vasile and Wani, M. Arif},
  booktitle = {Proceedings - 16th {IEEE} International Conference on Machine Learning and Applications, {ICMLA} 2017},
}