% Conference paper in the Euro-Par 2018 workshops proceedings (LNCS).
% The citation key is kept unchanged so existing \cite commands still resolve.
% Names use the unambiguous "Last, First" form; the acronym in the title is
% brace-protected so sentence-casing styles do not lowercase it.
% NOTE(review): booktitle, volume and the "and others" editor marker follow the
% publisher's record for this DOI -- confirm against the DOI landing page.
@inproceedings{1f975c3b9a2744078a05f46f49e81bf2,
  author    = {{Le F{\`e}vre}, Valentin and Bosilca, George and Bouteiller, Aurelien and Herault, Thomas and Hori, Atsushi and Robert, Yves and Dongarra, Jack},
  title     = {Do moldable applications perform better on failure-prone {HPC} platforms?},
  booktitle = {{Euro-Par} 2018: Parallel Processing Workshops},
  editor    = {Mencagli, Gabriele and Heras, Dora B. and others},
  series    = {Lecture Notes in Computer Science},
  volume    = {11339},
  pages     = {787--799},
  publisher = {Springer},
  year      = {2019},
  doi       = {10.1007/978-3-030-10549-5_61},
  isbn      = {9783030105488},
  language  = {English},
  keywords  = {Allocation length, Checkpoint, Moldable applications, Resilience, Restart, Spare nodes, Wait time},
  abstract  = {This paper compares the performance of different approaches to tolerate failures using checkpoint/restart when executed on large-scale failure-prone platforms. We study (i) RIGID applications, which use a constant number of processors throughout execution; (ii) MOLDABLE applications, which can use a different number of processors after each restart following a fail-stop error; and (iii) GRIDSHAPED applications, which are moldable applications restricted to use rectangular processor grids (such as many dense linear algebra kernels). For each application type, we compute the optimal number of failures to tolerate before relinquishing the current allocation and waiting until a new resource can be allocated, and we determine the optimal yield that can be achieved. We instantiate our performance model with a realistic applicative scenario and make it publicly available for further usage.},
  note      = {Publisher Copyright: {\textcopyright} Springer Nature Switzerland AG 2019. 24th International Conference on Parallel and Distributed Computing, Euro-Par 2018. Conference date: 27-08-2018 through 28-08-2018.},
}