% Conference paper in the HPCC/DSS/SmartCity/DependSys 2023 proceedings (IEEE).
% Names are stored in "Last, First" form so BibTeX splits them unambiguously.
% No series field: the exported value merely repeated booktitle verbatim.
@inproceedings{845f30522e444042a21088d27c9fec94,
  author    = {Trinh, Andy and Sheth, Shivam and Gaihre, Anil and Ding, Caiwen and Chen, Jieyang and Wang, Feiyi and Pugmire, David and Klasky, Scott and Liu, Hang and Wan, Lipeng},
  title     = {Understanding Node Allocation on Leadership-Class Supercomputers with Graph Analytics},
  editor    = {Chen, Jinjun and Yang, Laurence T.},
  booktitle = {Proceedings - 2023 IEEE International Conference on High Performance Computing and Communications, Data Science and Systems, Smart City and Dependability in Sensor, Cloud and Big Data Systems and Application, HPCC/DSS/SmartCity/DependSys 2023},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2023},
  pages     = {780--787},
  doi       = {10.1109/HPCC-DSS-SmartCity-DependSys60770.2023.00113},
  keywords  = {Graph Analytics, HPC, job scheduling, topology-aware allocation},
  language  = {English},
  abstract  = {As the scale of modern high-performance computing (HPC) systems keeps growing, job scheduling on those systems becomes extremely challenging. Particularly, one of the important tasks job schedulers need to fulfill is to optimize the node allocation to improve the jobs' execution efficiency. In order to optimize the node allocation, the job scheduling strategy must take the network topology of the HPC system into consideration. However, existing approaches are either designed for the specific network typologies (lack of generality) or rely on the applications' communication patterns (unknown without running on HPC). In this paper, we propose a generic topology-aware node allocation strategy based on graph algorithms. Our strategy can reduce the intra-job communication overhead and the inter-job communication interference by selecting nodes that form a sub-graph with much smaller diameter. We also propose and study four different initialization strategies for our node allocation algorithm to understand how different initialization strategies affect the node allocation results and speed. We evaluate the proposed methods using 30 days of real job traces collected from the OLCF's Titan supercomputer. Compared to the native job scheduling strategy used on Titan, adopting our approach can achieve a 2.5$\times$ diameter reduction on average, and for certain jobs the diameter reduction can be up to 8$\times$.},
  note      = {Publisher Copyright: {\textcopyright} 2023 IEEE.; 25th IEEE International Conferences on High Performance Computing and Communications, 9th International Conference on Data Science and Systems, 21st IEEE International Conference on Smart City and 9th IEEE International Conference on Dependability in Sensor, Cloud and Big Data Systems and Applications, HPCC/DSS/SmartCity/DependSys 2023 ; Conference date: 13-12-2023 Through 15-12-2023},
}