@inproceedings{d6fbe0c171af4e83b8f6056814c41d37,
  author     = {Hursey, Joshua and Naughton, Thomas and Vallee, Geoffroy and Graham, Richard L.},
  title      = {A Log-Scaling Fault Tolerant Agreement Algorithm for a Fault Tolerant {MPI}},
  booktitle  = {Recent Advances in the Message Passing Interface -- 18th European {MPI} Users' Group Meeting, {EuroMPI} 2011, Proceedings},
  series     = {Lecture Notes in Computer Science},
  publisher  = {Springer},
  pages      = {255--263},
  year       = 2011,
  doi        = {10.1007/978-3-642-24449-0_29},
  isbn       = {9783642244483},
  language   = {English},
  eventtitle = {18th European Message Passing Interface Users' Group Meeting, {EuroMPI} 2011},
  eventdate  = {2011-09-18/2011-09-21},
  keywords   = {Agreement Protocol, Algorithm Based Fault Tolerance, Fault Tolerance, MPI, Run-through Stabilization},
  abstract   = {The lack of fault tolerance is becoming a limiting factor for application scalability in HPC systems. The MPI does not provide standardized fault tolerance interfaces and semantics. The MPI Forum's Fault Tolerance Working Group is proposing a collective fault tolerant agreement algorithm for the next MPI standard. Such algorithms play a central role in many fault tolerant applications. This paper combines a log-scaling two-phase commit agreement algorithm with a reduction operation to provide the necessary functionality for the new collective without any additional messages. Error handling mechanisms are described that preserve the fault tolerance properties while maintaining overall scalability.},
}