% Workshop paper from the Euro-Par 2013 workshops proceedings (published 2014
% in Lecture Notes in Computer Science). The citation key is an opaque
% exporter-generated identifier and is kept as-is so existing \cite commands
% continue to resolve.
% Names are stored in "Last, First" form without braces around given names so
% that styles can abbreviate every initial correctly.
% NOTE(review): the LNCS volume number is not recorded in this entry -- TODO
% add a volume field after confirming it against the publisher record.
@inproceedings{7b09c5f4cfae4e0fbe228082a8676d84,
  author    = {Widener, Patrick M. and Ferreira, Kurt B. and Levy, Scott and Bridges, Patrick G. and Arnold, Dorian and Brightwell, Ron},
  title     = {Asking the Right Questions: Benchmarking Fault-Tolerant Extreme-Scale Systems},
  booktitle = {{Euro-Par} 2013: Parallel Processing Workshops},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Verlag},
  year      = {2014},
  pages     = {717--726},
  doi       = {10.1007/978-3-642-54420-0_70},
  isbn      = {978-3-642-54419-4},
  language  = {English},
  note      = {19th International Conference on Parallel Processing Workshops, Euro-Par 2013 -- BigDataCloud, DIHC, FedICI, HeteroPar, HiBB, LSDVE, MHPC, OMHI, PADABS, PROPER, Resilience, ROME, and UCHPC 2013; conference held 26--27 August 2013},
  abstract  = {Much recent research has explored fault-tolerance mechanisms intended for current and future extreme-scale systems. Evaluations of the suitability of checkpoint-based solutions have typically been carried out using relatively uncomplicated computational kernels designed to measure floating point performance. More recent investigations have added scaled-down ``proxy'' applications to more closely match the composition and behavior of deployed ones. However, the information obtained from these studies (whether floating point performance or application runtime) is not necessarily of the most value in evaluating resilience strategies. We observe that even when using a more sophisticated metric, the information available from evaluating uncoordinated checkpointing using both microbenchmarks and proxy applications does not agree. This implies that not only might researchers be asking the wrong questions, but that the answers to the right ones might be unexpected and potentially misleading. We seek to open a discussion on whether benchmarks designed to provide predictable performance evaluations of HPC hardware and toolchains are providing the right feedback for the evaluation of fault-tolerance in these applications, and more generally on how benchmarking of resilience mechanisms ought to be approached in the exascale design space.},
}