% Conference paper in the PDP 2018 proceedings (Euromicro PDP, 21-23 March 2018).
% The proceedings title is stored in booktitle only; no series field is set,
% because a series identical to booktitle is printed twice by standard styles.
@inproceedings{f8cd669842aa4b84837c0c191b105cba,
  author    = {Valero-Lara, Pedro and Martinez-Perez, Ivan and Mateo, Sergi and Sirvent, Raul and Beltran, Vicenc and Martorell, Xavier and Labarta, Jesus},
  title     = {Variable Batched {DGEMM}},
  booktitle = {Proceedings - 26th {Euromicro} International Conference on Parallel, Distributed, and Network-Based Processing, {PDP} 2018},
  editor    = {Kotenko, Igor and Merelli, Ivan and Lio, Pietro},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2018},
  month     = jun,
  day       = {6},
  pages     = {363--367},
  doi       = {10.1109/PDP2018.2018.00065},
  language  = {English},
  keywords  = {Auto tunning, Batched BLAS, DGEMM, Intel Xeon, OpenMP, Runtime},
  note      = {Publisher Copyright: {\textcopyright} 2018 IEEE.; 26th Euromicro International Conference on Parallel, Distributed, and Network-Based Processing, PDP 2018 ; Conference date: 21-03-2018 Through 23-03-2018},
  abstract  = {Many scientific applications are in need to solve a high number of small-size independent problems. These individual problems do not provide enough parallelism and then, these must be computed as a batch. Today, vendors such as Intel and NVIDIA are developing their own suite of batch routines. Although most of the works focus on computing batches of fixed size, in real applications we can not assume a uniform size for all set of problems. We explore and analyze different strategies based on parallel for, task and taskloop OpenMP pragmas. Although these strategies are straightforward from a programmer's point of view, they have a different impact on performance. We also analyze a new prototype provided by Intel (MKL), which deals with batch operations (cblas dgemm batch). We propose a new approach called grouping. It basically groups a set of problems until filling a limit in terms of memory occupancy or number of operations. In this way, groups composed by different number of problems are distributed on cores, achieving a more balanced distribution in terms of computational cost. This strategy is able to be up to 6{$\times$} faster than the Intel (MKL) batch routine.},
}