% Jin and Finkel, IPDPSW 2018: atomics-based integer sum-reduction kernel on an OpenCL FPGA platform.
% Acronyms are brace-protected so that sentence-casing styles keep their capitalisation.
% No series field: the exported value merely repeated booktitle, which made styles print the
% proceedings title twice.
@inproceedings{ac1354393fb645ec8bd597512f7e7210,
  author    = {Jin, Zheming and Finkel, Hal},
  title     = {Optimizing an Atomics-based Reduction Kernel on {OpenCL} {FPGA} Platform},
  booktitle = {Proceedings - 2018 {IEEE} 32nd International Parallel and Distributed Processing Symposium Workshops, {IPDPSW} 2018},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2018},
  month     = aug,
  day       = {3},
  pages     = {532--539},
  doi       = {10.1109/IPDPSW.2018.00092},
  isbn      = {9781538655559},
  language  = {English},
  keywords  = {Atomics, FPGA, OpenCL, Reductions},
  note      = {Publisher Copyright: {\textcopyright} 2018 IEEE.; 32nd IEEE International Parallel and Distributed Processing Symposium Workshops, IPDPSW 2018 ; Conference date: 21-05-2018 Through 25-05-2018},
  abstract  = {Field-programmable gate arrays (FPGAs) are becoming one of heterogeneous computing components in high-performance computing. To facilitate the use of FPGAs for developers and researchers, high-level synthesis tools are pushing the FPGA-based design abstraction from the register-transfer level to high-level language design flow using OpenCL/C/C++. Currently, there are few studies on the atomic functions in the OpenCL-based design flow on an FPGA. In this paper, we evaluate the performance of atomic functions using a reduction kernel on an OpenCL FPGA platform as a case study. We describe the implementations of an integer sum-reduction kernel in OpenCL, and perform the optimizations of memory accesses. Fully utilizing the bandwidth of the data bus can bring a factor of 15 improvement over the baseline kernel. The performance speedup of the kernel using local memory for atomic operations is 6.8X over the na{\"i}ve kernel using global memory. The combination of both optimizations can lead to 112X speedup. Compute unit duplication can be applied to the kernel to further improve the performance by a factor of 2.9.},
}