@inproceedings{12f30facc3864e139d8bb54b73e16a2c,
title = "High-Performance Matrix-Matrix Multiplications of Very Small Matrices",
abstract = "The use of the general dense matrix-matrix multiplication (GEMM) is fundamental for obtaining high performance in many scientific computing applications. GEMMs for small matrices (of sizes less than 32), however, are not sufficiently optimized in existing libraries. In this paper we consider the case of many small GEMMs on either CPU or GPU architectures. This case often occurs in applications like big data analytics, machine learning, high-order FEM, and others. The GEMMs are grouped together in a single batched routine. We present algorithms and optimization techniques specialized for these cases that obtain performance within 90% of the optimal. We show that these results outperform currently available state-of-the-art implementations and vendor-tuned math libraries.",
keywords = "Autotuning, Batched GEMM, GEMM, HPC, Small matrices",
author = "Ian Masliah and Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Marc Baboulin and Jo{\"e}l Falcou and Jack Dongarra",
note = "Publisher Copyright: {\textcopyright} Springer International Publishing Switzerland 2016; 22nd International Conference on Parallel and Distributed Computing, Euro-Par 2016; Conference date: 24-08-2016 through 26-08-2016",
year = "2016",
doi = "10.1007/978-3-319-43659-3_48",
language = "English",
isbn = "9783319436586",
series = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
publisher = "Springer Verlag",
pages = "659--671",
editor = "Pierre-Fran{\c c}ois Dutot and Denis Trystram",
booktitle = "Parallel Processing - 22nd International Conference on Parallel and Distributed Computing, Euro-Par 2016, Proceedings",
}