% Talk abstract by Jack J. Dongarra on fault-tolerant MPI, published in the
% proceedings of the 11th European PVM/MPI Users' Group Meeting (2004).
% - booktitle holds the proceedings title; series/volume identify the LNCS volume.
% - Names use the "Last, First" form so any style can parse and abbreviate them.
% - pages is kept as exported; presumably a single page number, not a page
%   count -- TODO confirm against the publisher record.
@inproceedings{934cb53c35854fe58a13a4d79227b4a4,
  author    = {Dongarra, Jack J.},
  title     = {Fault Tolerance in Message Passing and in Action},
  editor    = {Kranzlm{\"u}ller, Dieter and Kacsuk, Peter and Dongarra, Jack},
  booktitle = {Recent Advances in Parallel Virtual Machine and Message Passing Interface: 11th European {PVM/MPI} Users' Group Meeting, Proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {3241},
  publisher = {Springer Verlag},
  year      = {2004},
  pages     = {6},
  doi       = {10.1007/978-3-540-30218-6_3},
  isbn      = {3540231633},
  language  = {English},
  abstract  = {This talk will describe an implementation of MPI which extends the message passing model to allow for recovery in the presence of a faulty process. Our implementation allows a user to catch the fault and then provide for a recovery. We will also touch on the issues related to using diskless checkpointing to allow for effective recovery of an application in the presence of a process fault.},
  note      = {Publisher Copyright: {\textcopyright} Springer-Verlag Berlin Heidelberg 2004.; 11th European Conference on Parallel Virtual Machine and Message Passing Interface Users Group Meeting, PVM/MPI 2004 ; Conference date: 19-09-2004 Through 22-09-2004},
}