% Widener, Ferreira, Levy (2018): paper on scheduling resilience activity, in the Euro-Par 2017 proceedings (LNCS), pp. 581--592.
@inproceedings{a900bec1633e4dc2a45e5c3c9cb4bfa3,
  author    = {Widener, Patrick M. and Ferreira, Kurt B. and Levy, Scott},
  title     = {It{\textquoteright}s not the heat, it{\textquoteright}s the humidity: Scheduling resilience activity at scale},
  booktitle = {{Euro-Par} 2017},
  editor    = {Heras, Dora B. and Bouge, Luc},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Verlag},
  year      = {2018},
  pages     = {581--592},
  doi       = {10.1007/978-3-319-75178-8_47},
  isbn      = {9783319751771},
  language  = {English},
  keywords  = {Collectives, Performance, Resilience, Scheduling},
  note      = {Publisher Copyright: {\textcopyright} Springer International Publishing AG, part of Springer Nature 2018.; International Workshops on Parallel Processing, Euro-Par 2017 ; Conference date: 28-08-2017 Through 29-08-2017},
  abstract  = {Maintaining the performance of high-performance computing (HPC) applications with the expected increase in failures is a major challenge for next-generation extreme-scale systems. With increasing scale, resilience activities (e.g. checkpointing) are expected to become more diverse, less tightly synchronized, and more computationally intensive. Few existing studies, however, have examined how decisions about scheduling resilience activities impact application performance. In this work, we examine the relationship between the duration and frequency of resilience activities and application performance. Our study reveals several key findings: (i) the aggregate amount of time consumed by resilience activities is not an effective metric for predicting application performance; (ii) the duration of the interruptions due to resilience activities has the greatest influence on application performance; shorter, but more frequent, interruptions are correlated with better application performance; and (iii) the differential impact of resilience activities across applications is related to the applications{\textquoteright} inter-collective frequencies; the performance of applications that perform infrequent collective operations scales better in the presence of resilience activities than the performance of applications that perform more frequent collective operations. This initial study demonstrates the importance of considering how resilience activities are scheduled. We provide critical analysis and direct guidance on how the resilience challenges of future systems can be met while minimizing the impact on application performance.},
}