% Workshop paper (ScalA'11, co-located with SC'11): soft error resilient QR
% factorization for hybrid CPU/GPGPU systems.
% Maintenance notes:
%   - Acronyms in the title are brace-protected so sentence-casing styles keep them.
%   - Authors are stored in "Last, First" form for unambiguous name parsing.
%   - No series field: the exported value only repeated booktitle, which made
%     styles print the proceedings title twice.
@inproceedings{91c3a20f6cd04040bdd7788ad728009a,
title = "Soft error resilient {QR} factorization for hybrid system with {GPGPU}",
abstract = "The general purpose graphics processing units (GPGPU) are increasingly deployed for scientific computing due to their performance advantages over CPUs. As a result, fault tolerance has become a more serious concern compared to the period when GPGPUs were used exclusively for graphics applications. Using GPUs and CPUs together in a hybrid computing system increases flexibility and performance but also increases the possibility of the computations being affected by soft errors. In this work, we propose a soft error resilient algorithm for QR factorization on such hybrid systems. Our contributions include (1) a checkpointing and recovery mechanism for the left-factor Q whose performance is scalable on hybrid systems; (2) optimized Givens rotation utilities on GPGPUs to efficiently reduce an upper Hessenberg matrix to an upper triangular form for the protection of the right factor R, and (3) a recovery algorithm based on QR update on GPGPUs. Experimental results show that our fault tolerant QR factorization can successfully detect and recover from soft errors in the entire matrix with little overhead on hybrid systems with GPGPUs.",
keywords = "GPGPU, QR factorization, soft error",
author = "Du, Peng and Luszczek, Piotr and Tomov, Stan and Dongarra, Jack",
year = "2011",
doi = "10.1145/2133173.2133179",
language = "English",
isbn = "9781450311809",
pages = "11--14",
booktitle = "ScalA'11 - Proceedings of the 2011 ACM Workshop on Scalable Algorithms for Large-Scale Systems, Co-located with SC'11",
note = "2011 ACM Workshop on Scalable Algorithms for Large-Scale Systems, ScalA'11, Co-located with SC'11 ; Conference date: 14-11-2011 Through 14-11-2011",
}