% Goswami et al., Landrush: conference paper in the CCGrid 2016 proceedings, pp. 32--41.
% Acronyms and the system name are brace-protected so sentence-casing styles keep their capitals.
% No series field: this entry has no named publication series distinct from the proceedings title.
@inproceedings{4b6b623755c44ca38ce90c78a975bc96,
  author    = "Goswami, Anshuman and Tian, Yuan and Schwan, Karsten and Zheng, Fang and Young, Jeffrey and Wolf, Matthew and Eisenhauer, Greg and Klasky, Scott",
  title     = "{Landrush}: Rethinking In-Situ Analysis for {GPGPU} Workflows",
  booktitle = "Proceedings - 2016 16th {IEEE/ACM} International Symposium on Cluster, Cloud, and Grid Computing, {CCGrid} 2016",
  pages     = "32--41",
  publisher = "Institute of Electrical and Electronics Engineers Inc.",
  year      = "2016",
  month     = jul,
  day       = "18",
  doi       = "10.1109/CCGrid.2016.58",
  language  = "English",
  keywords  = "GPU workflow, In-situ analysis, instrumentation, runtime scheduler",
  note      = "Publisher Copyright: {\textcopyright} 2016 IEEE.; 16th IEEE/ACM International Symposium on Cluster, Cloud, and Grid Computing, CCGrid 2016 ; Conference date: 16-05-2016 Through 19-05-2016",
  abstract  = "In-situ analysis on the output data of scientific simulations has been made necessary by ever-growing output data volumes and increasing costs of data movement as supercomputing is moving towards exascale. With hardware accelerators like GPUs becoming increasingly common in high end machines, new opportunities arise to co-locate scientific simulations and online analysis performed on the scientific data generated by the simulations. However, the asynchronous nature of GPGPU programming models and the limited context-switching capabilities on the GPU pose challenges to co-locating the scientific simulation and analysis on the same GPU. This paper dives deeper into these challenges to understand how best to co-locate analysis with scientific simulations on the GPUs in HPC clusters. Specifically, our 'Landrush' approach to GPU sharing proposes a solution that utilizes idle cycles on the GPU to provide an improved time-to-answer, that is, the total time to run the scientific simulation and analysis of the generated data. Landrush is demonstrated with experimental results obtained from leadership high-end applications on ORNL's Titan supercomputer, which show that (i) GPU-based scientific simulations have varying degrees of idle cycles to afford useful analysis task co-location, and (ii) the inability to context switch on the GPU at instruction granularity can be overcome by careful control of the analysis kernel launches and software-controlled early completion of analysis kernel executions. Results show that Landrush is superior in terms of time-to-answer compared to serially running simulations followed by analysis or by relying on the GPU driver and hardwired thread dispatcher to run analysis concurrently on a single GPU.",
}