% Conference paper (pp. 112--114) in the IEEE CLOUD 2022 proceedings.
% LaTeX specials (ampersand, percent) are escaped so that styles which print
% the note/abstract fields compile cleanly; acronyms and the system name are
% brace-protected against sentence-casing styles.
@inproceedings{297346f87b6948e7a24ed0746f9952fb,
  author    = {Shin, Changyong and Yang, Gyeongsik and Yoo, Yeonho and Lee, Jeunghwan and Yoo, Chuck},
  title     = {{Xonar}: Profiling-based Job Orderer for Distributed Deep Learning},
  booktitle = {Proceedings - 2022 {IEEE} 15th International Conference on Cloud Computing, {CLOUD} 2022},
  editor    = {Ardagna, Claudio Agostino and Atukorala, Nimanthi and Buyya, Rajkumar and Chang, Carl K. and Chang, Rong N. and Damiani, Ernesto and Dasgupta, Gargi Banerjee and Gagliardi, Fabrizio and Hagleitner, Christoph and Milojicic, Dejan and Trong, Tuan M Hoang and Ward, Robert and Xhafa, Fatos and Zhang, Jia},
  series    = {{IEEE} International Conference on Cloud Computing, {CLOUD}},
  publisher = {IEEE Computer Society},
  year      = {2022},
  pages     = {112--114},
  doi       = {10.1109/CLOUD55607.2022.00030},
  language  = {English},
  keywords  = {distributed deep learning, GPU cloud, GPU utilization, job completion time, parallel training},
  abstract  = {Deep learning models have a wide spectrum of GPU execution time and memory size. When running distributed training jobs, however, their GPU execution time and memory size have not been taken into account, which leads to the high variance of job completion time (JCT). Moreover, the jobs often run into the GPU out-of-memory (OoM) problem so that the unlucky job has to restart all over. To address the problems, we propose Xonar to profile the deep learning jobs and order them in the queue. The experiments show that Xonar with TensorFlow v1.6 reduces the tail JCT by 44\% with the OoM problem eliminated.},
  note      = {Funding Information: This work was partly supported by Institute of Information \& communications Technology Planning \& Evaluation funded by the Korea government (Ministry of Science and ICT) (2015-0-00280, (SW Starlab) Next generation cloud infra-software toward the guarantee of performance and security SLA) and by Basic Science Research Program through National Research Foundation of Korea funded by the Ministry of Education (NRF-2021R1A6A1A13044830). Co-corresponding authors: Chuck Yoo and Gyeongsik Yang. Publisher Copyright: {\textcopyright} 2022 IEEE.; 15th IEEE International Conference on Cloud Computing, CLOUD 2022 ; Conference date: 10-07-2022 Through 16-07-2022},
}