% Workshop paper by Joo and Kurth, published in the ISC High Performance 2018
% International Workshops proceedings (Lecture Notes in Computer Science).
%
% - The citation key is the opaque identifier produced by the exporting system;
%   it is kept unchanged so that existing citations continue to resolve.
% - Names are stored in "Last, First" form so parsing does not depend on case.
% - Event metadata lives in eventtitle/eventdate (biblatex fields; classic
%   BibTeX styles ignore unknown fields) instead of the free-form note.
% - NOTE(review): the LNCS volume number is not recorded in this entry --
%   TODO add a volume field after confirming it against the publisher record.
% - NOTE(review): publisher is kept exactly as exported; confirm preferred form.
@inproceedings{68de5440c287430c96cf42bda2501684,
  author     = {Jo{\'o}, B{\'a}lint and Kurth, Thorsten},
  title      = {Lessons Learned from Optimizing Kernels for Adaptive Aggregation Multi-grid Solvers in Lattice {QCD}},
  editor     = {Yokota, Rio and Weiland, Mich{\`e}le and Alam, Sadaf and Shalf, John},
  booktitle  = {High Performance Computing -- {ISC} High Performance 2018 International Workshops, Revised Selected Papers},
  series     = {Lecture Notes in Computer Science},
  publisher  = {Springer Verlag},
  pages      = {472--486},
  year       = {2018},
  doi        = {10.1007/978-3-030-02465-9_34},
  isbn       = {9783030024642},
  language   = {English},
  eventtitle = {International Conference on High Performance Computing, {ISC} High Performance 2018},
  eventdate  = {2018-06-28},
  note       = {Publisher Copyright: {\textcopyright} 2018, Springer Nature Switzerland AG.},
  abstract   = {In recent years, adaptive aggregation multi-grid (AAMG) methods have become the gold standard for solving the Dirac equation in Lattice QCD (LQCD) using Wilson-Clover fermions. These methods are able to overcome the critical slowing down as quark masses approach their physical values and are thus the go-to method for performing Lattice QCD calculations at realistic physical parameters. In this paper we discuss the optimization of a specific building block for implementing AAMG for Wilson-Clover fermions from LQCD, known as the coarse restrictor operator, on contemporary Intel processors featuring large SIMD widths and high thread counts. We will discuss in detail the efficient use of OpenMP and Intel vector intrinsics in our attempts to exploit fine grained parallelism on the coarsest levels. We present performance optimizations and discuss the ramifications for implementing a full AAMG stack on Intel Xeon Phi Knights Landing and Skylake processors.},
}