% Conference paper: Sao, Vuduc, and Li, "A Distributed CPU-GPU Sparse Direct
% Solver", in the Euro-Par 2014 proceedings (LNCS series), pp. 487--498.
% Names are stored in "Last, First" form with accents written as BibTeX
% special characters; acronyms are brace-protected against style recasing.
% NOTE(review): the LNCS volume number is not recorded in this entry; add a
% `volume` field once it has been confirmed against the publisher record.
% NOTE(review): the third editor keeps the exported surname split
% ("Costa" / "Vitor Santos"); confirm whether the surname is "Santos Costa".
@inproceedings{329c71396f6f433ea3c3d4afe442dda4,
  author    = {Sao, Piyush and Vuduc, Richard and Li, Xiaoye Sherry},
  title     = {A Distributed {CPU-GPU} Sparse Direct Solver},
  booktitle = {{Euro-Par} 2014},
  editor    = {Silva, Fernando and Dutra, In{\^e}s and Costa, V{\'\i}tor Santos},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Verlag},
  year      = {2014},
  pages     = {487--498},
  isbn      = {9783319098722},
  doi       = {10.1007/978-3-319-09873-9_41},
  language  = {english},
  note      = {20th International Conference on Parallel Processing, Euro-Par 2014; conference dates: 25--29 August 2014},
  abstract  = {This paper presents the first hybrid MPI+OpenMP+CUDA implementation of a distributed memory right-looking unsymmetric sparse direct solver (i.e., sparse LU factorization) that uses static pivoting. While BLAS calls can account for more than 40\% of the overall factorization time, the difficulty is that small problem sizes dominate the workload, making efficient GPU utilization challenging. This fact motivates our approach, which is to find ways to aggregate collections of small BLAS operations into larger ones; to schedule operations to achieve load balance and hide long-latency operations, such as PCIe transfer; and to exploit simultaneously all of a node's available CPU cores and GPUs.},
}