% Conference paper from the ICPP 2021 workshop proceedings (ACM ICPS series).
% Acronyms are brace-protected so sentence-casing styles keep their capitals,
% and literal percent signs in the abstract are escaped so the field is safe
% to typeset. The citation key is kept as exported so existing \cite commands
% continue to resolve.
@inproceedings{eade46357373416594453ac366049a4a,
  author    = {Jin, Zheming and Vetter, Jeffrey},
  title     = {Evaluating the Performance of Integer Sum Reduction in {SYCL} on {GPUs}},
  booktitle = {50th International Conference on Parallel Processing Workshop, {ICPP} 2021 - Proceedings},
  series    = {{ACM} International Conference Proceeding Series},
  publisher = {Association for Computing Machinery},
  year      = {2021},
  month     = aug,
  day       = {9},
  doi       = {10.1145/3458744.3473360},
  language  = {English},
  keywords  = {CUDA, GPGPU, OpenCL, Reduction, SYCL},
  abstract  = {SYCL is a promising programming model for heterogeneous computing - allowing a single-source code to target devices from multiple vendors. One significant task performed on these accelerators is a primitive operation for integer sum reduction. This paper presents several SYCL implementations of integer sum reduction - using atomic functions, shared local memory, vectorized memory accesses and parameterized workload sizes - to compare the performance and maturity of SYCL against open-source vendor-specific implementations of the same reduction. For a sufficiently large number of integers, tuning the parameters of our SYCL implementations achieves 1.4X speedup over the open-source implementations on an Intel UHD630 integrated GPU. The SYCL reduction is 3\% faster than the templated reduction in Thrust, and 0.3\% faster than the device reduction in CUB on an Nvidia P100 GPU. The SYCL reduction is 1.9\% faster than the templated reduction in Thrust, and 0.4\% faster than the device reduction in CUB on an Nvidia V100 GPU.},
  note      = {Publisher Copyright: {\textcopyright} 2021 ACM.; 50th International Conference on Parallel Processing Workshop, ICPP 2021 ; Conference date: 09-08-2021 Through 12-08-2021},
}