% Workshop paper published in the DIDC'12 proceedings (per the note field, the event took place on 19-06-2012).
% The citation key is kept unchanged so that existing \cite commands keep resolving.
@inproceedings{28bcd7a6a3da4ede8f2aa97fbec5960b,
  author    = {Maheshwari, Ketan and Espinosa, Allan and Katz, Daniel S. and Wilde, Michael and Zhang, Zhao and Foster, Ian and Callaghan, Scott and Maechling, Phillip},
  title     = {Job and data clustering for aggregate use of multiple production cyberinfrastructures},
  booktitle = {{DIDC}'12 - 5th International Workshop on Data-Intensive Distributed Computing},
  year      = {2012},
  pages     = {3--11},
  doi       = {10.1145/2286996.2287000},
  isbn      = {9781450313414},
  language  = {English},
  keywords  = {HPC, Parallel, SCEC, Swift},
  note      = {5th International Workshop on Data-Intensive Distributed Computing, DIDC'12; Conference date: 19-06-2012},
  abstract  = {In this paper, we address the challenges of reducing the time-to-solution of the data intensive earthquake simulation workflow ``CyberShake'' by supplementing the high-performance parallel computing (HPC) resources on which it typically runs with distributed, heterogeneous resources that can be obtained opportunistically from grids and clouds. We seek to minimize time to solution by maximizing the amount of work that can be efficiently done on the distributed resources. We identify data movement as the main bottleneck in effectively utilizing the combined local and distributed resources. We address this by analyzing the I/O characteristics of the application, processor acquisition rate (from a pilot-job service), and the data movement throughput of the infrastructure. With these factors in mind, we explore a combination of strategies including partitioning of computation (over HPC and distributed resources) and job clustering. We validate our approach with a theoretical study and with preliminary measurements on the Ranger HPC system and distributed Open Science Grid resources. More complete performance results will be presented in the final submission of this paper.},
}