% Conference paper in the IPDPS 2014 proceedings. The citation key looks like an
% exported identifier and is kept unchanged so existing citations still resolve.
% Author initials are stored as plain text (no escaped braces) in "Last, First"
% form; acronyms in the title are brace-protected against style recasing.
@inproceedings{3a743633ab4a4d98bfd4e97ad8975c0a,
  author    = {Winter, F. T. and Clark, M. A. and Edwards, R. G. and Jo{\'o}, B.},
  title     = {A framework for lattice {QCD} calculations on {GPUs}},
  booktitle = {Proceedings - IEEE 28th International Parallel and Distributed Processing Symposium, IPDPS 2014},
  series    = {Proceedings of the International Parallel and Distributed Processing Symposium, IPDPS},
  publisher = {IEEE Computer Society},
  year      = {2014},
  pages     = {1073--1082},
  doi       = {10.1109/IPDPS.2014.112},
  isbn      = {9780769552071},
  language  = {English},
  keywords  = {Application framework, CUDA, GPU, JIT, Lattice QCD, PTX},
  note      = {28th IEEE International Parallel and Distributed Processing Symposium, IPDPS 2014 ; Conference date: 19-05-2014 Through 23-05-2014},
  abstract  = {Computing platforms equipped with accelerators like GPUs have proven to provide great computational power. However, exploiting such platforms for existing scientific applications is not a trivial task. Current GPU programming frameworks such as CUDA C/C++ require low-level programming from the developer in order to achieve high performance code. As a result porting of applications to GPUs is typically limited to time-dominant algorithms and routines, leaving the remainder not accelerated which can open a serious Amdahl's law issue. The Lattice QCD application Chroma allows us to explore a different porting strategy. The layered structure of the software architecture logically separates the data-parallel from the application layer. The QCD Data-Parallel software layer provides data types and expressions with stencil-like operations suitable for lattice field theory. Chroma implements algorithms in terms of this high-level interface. Thus by porting the low-level layer one effectively ports the whole application layer in one swing. The QDP-JIT/PTX library, our reimplementation of the low-level layer, provides a framework for Lattice QCD calculations for the CUDA architecture. The complete software interface is supported and thus applications can be run unaltered on GPU-based parallel computers. This reimplementation was possible due to the availability of a JIT compiler which translates an assembly language (PTX) to GPU code. The existing expression templates enabled us to employ compile-time computations in order to build code generators and to automate the memory management for CUDA. Our implementation has allowed us to deploy the full Chroma gauge-generation program on large scale GPU-based machines such as Titan and Blue Waters and accelerate the calculation by more than an order of magnitude.},
}