% Conference paper describing the mpi_jm job-management library, published in the
% ISC High Performance 2018 workshop proceedings (Lecture Notes in Computer Science).
% Underscores in typeset text fields are escaped as \_ so the entry compiles outside
% math mode; the DOI is deliberately left raw. Copyright and event details are kept
% in dedicated fields rather than in `note`, so they do not print as trailing free text.
@inproceedings{81afee35d6164e18952ed03d719d4ed7,
  author     = {Berkowitz, Evan and Jansen, Gustav and McElvain, Kenneth and Walker-Loud, Andr{\'e}},
  title      = {Job Management with {mpi\_jm}},
  booktitle  = {High Performance Computing - {ISC} High Performance 2018 International Workshops, Revised Selected Papers},
  editor     = {Yokota, Rio and Shalf, John and Alam, Sadaf and Weiland, Mich{\`e}le},
  series     = {Lecture Notes in Computer Science},
  publisher  = {Springer Verlag},
  year       = {2018},
  pages      = {432--439},
  doi        = {10.1007/978-3-030-02465-9_30},
  isbn       = {9783030024642},
  language   = {English},
  keywords   = {CORAL, Job management, Pilot systems},
  eventtitle = {International Conference on High Performance Computing, {ISC} High Performance 2018},
  eventdate  = {2018-06-28},
  copyright  = {{\textcopyright} 2018, Springer Nature Switzerland AG},
  abstract   = {Access to Leadership computing is required for HPC applications that require a large fraction of compute nodes for a single computation and also for use cases where the volume of smaller tasks can only be completed in a competitive or reasonable time frame through use of these Leadership computing facilities. In the latter case, a robust and lightweight manager is ideal so that all these tasks can be computed in a machine-friendly way, notably with minimal use of mpirun or equivalent to launch the executables (simple bundling of tasks can over-tax the service nodes and crash the entire scheduler). Our library, mpi\_jm, can manage such allocations, provided access to the requisite MPI functionality is provided. mpi\_jm is fault-tolerant against a modest number of down or non-communicative nodes, can begin executing work on smaller portions of a larger allocation before all nodes become available for the allocation, can manage GPU-intensive and CPU-only work independently and can overlay them peacefully on shared nodes. It is easily incorporated into existing MPI-capable executables, which then can run both independently and under mpi\_jm management. It provides a flexible Python interface, unlocking many high-level libraries, while also tightly binding users{\textquoteright} executables to hardware.},
}