% Widener, Ferreira, Levy: approximate coordination in local checkpointing (Euro-Par 2016, LNCS; published 2017).
% The citation key is kept as-is so existing citations of this entry keep resolving.
@inproceedings{e128fb2e2a224e4b9ae9443ce109ca9a,
title = "Horseshoes and hand grenades: The case for approximate coordination in local checkpointing protocols",
abstract = "Fault-tolerance poses a major challenge for future large-scale systems. Active research into coordinated, uncoordinated, and hybrid checkpointing systems has explored how the introduction of asynchrony can address anticipated scalability issues. While fully uncoordinated approaches have been shown to have significant delays, the degree of synchronization required to keep overheads low has not yet been significantly addressed. In this paper, we use a simulation-based approach to show the impact of synchronization on local checkpoint activity. Specifically, we show the degree of synchronization needed to keep the impacts of local checkpointing low is attainable with current technology for a number of key production HPC workloads. Our work provides a critical analysis and comparison of synchronization and local checkpointing. This enables users and system administrators to fine-tune the checkpointing scheme to the application and system characteristics available.",
author = "Widener, {Patrick M.} and Ferreira, {Kurt B.} and Levy, Scott",
note = "Publisher Copyright: {\textcopyright} Springer International Publishing AG 2017.; 22nd International Conference on Parallel and Distributed Computing, Euro-Par 2016 ; Conference date: 24-08-2016 Through 26-08-2016",
year = "2017",
doi = "10.1007/978-3-319-58943-5_50",
language = "English",
isbn = "9783319589428",
series = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
publisher = "Springer Verlag",
pages = "623--634",
editor = "Dutot, Pierre-Francois and Desprez, Frederic",
booktitle = "{Euro-Par} 2016",
}