% Workshop paper from OpenSHMEM 2015 (2nd Workshop on OpenSHMEM and Related
% Technologies), published in the Lecture Notes in Computer Science series.
% The acronym OpenSHMEM is brace-protected so sentence-casing styles keep its case.
% NOTE(review): the first editor's name is kept exactly as exported
% (surname "Venkata"); the surname may actually be "Gorentla Venkata" -- confirm
% against the proceedings front matter before changing it.
@inproceedings{d29d28563af348c998c5adcfaa945c28,
  author    = {Hao, Pengfei and Pophale, Swaroop and Shamis, Pavel and Curtis, Tony and Chapman, Barbara},
  title     = {Check-pointing approach for fault tolerance in {OpenSHMEM}},
  booktitle = {{OpenSHMEM} and Related Technologies},
  editor    = {Venkata, {Manjunath Gorentla} and Shamis, Pavel and Imam, Neena and Lopez, {M. Graham}},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Verlag},
  year      = {2015},
  pages     = {36--52},
  doi       = {10.1007/978-3-319-26428-8_3},
  isbn      = {9783319264271},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} Springer International Publishing Switzerland 2015.; 2nd Workshop on OpenSHMEM and Related Technologies, OpenSHMEM 2015 ; Conference date: 04-08-2015 Through 06-08-2015},
  abstract  = {Fault tolerance for long running applications is critical to guard against failure of either compute resources or a network. Accomplishing this task in software is non-trivial and there is an added level of complexity for implementing a working model for a one-sided communications library like OpenSHMEM, since there is no matching communication call at the target processing element (PE). In this paper we explore a fault tolerance scheme based on check-point and restart, that caters to the one-sided nature of PGAS programming model while leveraging features very specific to OpenSHMEM. Through a working implementation with the 1-D Jacobi code, we show that the approach is scalable and provides considerable computational resource saving.},
}