% Workshop paper from ScalA 2013 (held in conjunction with SC 2013).
% Field notes:
%   - Acronyms are brace-protected so styles that recase titles keep them.
%   - Percent signs in the abstract are escaped so the field is safe to typeset.
%   - The venue is given in full in booktitle; the co-location and conference
%     dates live in note.
@inproceedings{dcfc7c995a07492995d4432a29a714ff,
  author    = {Jia, Yulu and Luszczek, Piotr and Bosilca, George and Dongarra, Jack J.},
  title     = {{CPU-GPU} hybrid bidiagonal reduction with soft error resilience},
  booktitle = {Proceedings of the Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems ({ScalA} 2013)},
  year      = {2013},
  month     = nov,
  doi       = {10.1145/2530268.2530270},
  isbn      = {9781450325080},
  language  = {English},
  keywords  = {ABFT, Bidiagonalization, GPU, Hybrid, Resilient, Reverse computation, Soft error},
  abstract  = {Soft errors pose a real challenge to applications running on modern hardware as the feature size becomes smaller and the integration density increases for both the modern processors and the memory chips. Soft errors manifest themselves as bit-flips that alter the user value, and numerical software is a category of software that is sensitive to such data changes. In this paper, we present a design of a bidiagonal reduction algorithm that is resilient to soft errors, and we also describe its implementation on hybrid CPU-GPU architectures. Our fault-tolerant algorithm employs Algorithm Based Fault Tolerance, combined with reverse computation, to detect, locate, and correct soft errors. The tests were performed on a Sandy Bridge CPU coupled with an NVIDIA Kepler GPU. The included experiments show that our resilient bidiagonal reduction algorithm adds very little overhead compared to the error-prone code. At matrix size $10110 \times 10110$, our algorithm only has a performance overhead of 1.085\% when one error occurs, and 0.354\% when no errors occur.},
  note      = {Held in conjunction with the International Conference for High Performance Computing, Networking, Storage and Analysis ({SC} 2013); conference dates: 17--21 November 2013},
}