% Conference paper record (inproceedings), published in the "Advances in Parallel Computing" series.
% NOTE(review): "resiliant" in the title looks like a misspelling of "resilient"; it is kept
%   verbatim here -- confirm against the publisher's record before changing it.
% NOTE(review): the isbn value differs from the ISBN embedded in the doi (978-1-60750-530-3);
%   possibly print vs. online editions -- confirm.
@inproceedings{f39e0345b2744d7f8848eb380661db9a,
  author    = {Bosilca, George and Coti, Camille and Herault, Thomas and Lemarinier, Pierre and Dongarra, Jack},
  title     = {Constructing resiliant communication infrastructure for runtime environments},
  booktitle = {Parallel Computing},
  series    = {Advances in Parallel Computing},
  publisher = {IOS Press BV},
  pages     = {441--451},
  year      = {2010},
  doi       = {10.3233/978-1-60750-530-3-441},
  isbn      = {9781607505297},
  language  = {English},
  keywords  = {Self-stabilization, binomial graph, scalability},
  abstract  = {High performance computing platforms are becoming larger, leading to scalability and fault-tolerance issues for both applications and runtime environments (RTE) dedicated to run on such machines. After being deployed, usually following a spanning tree, a RTE needs to build its own communication infrastructure to manage and monitor the tasks of parallel applications. Previous works have demonstrated that the Binomial Graph topology (BMG) is a good candidate as a communication infrastructure for supporting scalable and fault-tolerant RTE. In this paper, we present and analyze a self-stabilizing algorithm to transform the underlying communication infrastructure provided by the launching service into a BMG, and maintain it in spite of failures. We demonstrate that this algorithm is scalable, tolerates transient failures, and adapts itself to topology changes.},
}