% Conference paper by Kumar and Engelmann in the Euro-Par 2021 proceedings
% (Springer, Lecture Notes in Computer Science). Names are stored in the
% unambiguous "Last, First" form; acronyms in titles are brace-protected so
% that sentence-casing styles keep their capitalisation.
@inproceedings{8f35fa99def448e088789eb99be8fb4c,
  author    = {Kumar, Mohit and Engelmann, Christian},
  title     = {{RDPM}: An Extensible Tool for Resilience Design Patterns Modelling},
  booktitle = {{Euro-Par} 2021},
  editor    = {Chaves, Ricardo and Heras, Dora B. and Ilic, Aleksandar and Unat, Didem and Badia, Rosa M. and Bracciali, Andrea and Diehl, Patrick and Dubey, Anshu and Oh, Sangyoon and Scott, Stephen L. and Ricci, Laura},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  year      = {2022},
  pages     = {283--297},
  doi       = {10.1007/978-3-031-06156-1_23},
  isbn      = {9783031061554},
  language  = {English},
  keywords  = {Design patterns, High-performance computing, Resilience, Tool},
  abstract  = {Resilience to faults, errors, and failures in extreme-scale high-performance computing (HPC) systems is a critical challenge. Resilience design patterns offer a new, structured hardware and software design approach for improving resilience. While prior work focused on developing performance, reliability, and availability models for resilience design patterns, this paper extends it by providing a Resilience Design Patterns Modeling (RDPM) tool which allows (1) exploring performance, reliability, and availability of each resilience design pattern, (2) offering customization of parameters to optimize performance, reliability, and availability, and (3) allowing investigation of trade-off models for combining multiple patterns for practical resilience solutions.},
  note      = {Publisher Copyright: {\textcopyright} 2022, Springer Nature Switzerland AG.; 27th International Conference on Parallel and Distributed Computing, Euro-Par 2021 ; Conference date: 30-08-2021 Through 31-08-2021},
}