% Conference paper introducing FT-MPI, published in the ICCS 2001 proceedings.
% Names are stored as "Last, First" without braces around given names so that
% styles can abbreviate and sort them; acronyms in titles are brace-protected.
% NOTE(review): the series volume number is not recorded here -- add a
% volume field once confirmed against the publisher record.
@inproceedings{8b4e62629b9940f1b74dd22b70d094f8,
  author    = {Fagg, Graham E. and Bukovsky, Antonin and Dongarra, Jack J.},
  title     = {Fault tolerant {MPI} for the {HARNESS} Meta-computing system},
  booktitle = {Computational Science -- {ICCS} 2001 -- International Conference, 2001, Proceedings},
  editor    = {Alexandrov, Vassil N. and Dongarra, Jack J. and Juliano, Benjoe A. and Renner, Ren{\'e} S. and Tan, C. J. Kenneth},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Verlag},
  year      = {2001},
  pages     = {355--366},
  doi       = {10.1007/3-540-45545-0_44},
  isbn      = {3540422323},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} Springer-Verlag Berlin Heidelberg 2001.; International Conference on Computational Science, ICCS 2001 ; Conference date: 28-05-2001 Through 30-05-2001},
  abstract  = {Initial versions of MPI were designed to work efficiently on multiprocessors which had very little job control and thus static process models. Subsequently forcing them to support a dynamic process model suitable for use on clusters or distributed systems would have reduced their performance. As current HPC collaborative applications increase in size and distribution the potential levels of node and network failures increase the need arises for new fault tolerant systems to be developed. Here we present a new implementation of MPI called FT-MPI that allows the semantics and associated modes of failures to be explicitly controlled by an application via a modified MPI API. Given is an overview of the FT-MPI semantics, design, example applications and some performance issues such as efficient group communications and complex data handling.},
}