% Workshop paper in the PDSW 2015 proceedings (held in conjunction with SC 2015).
% The full proceedings title is stored in booktitle; acronyms are brace-protected
% so that sentence-casing styles keep their capitalisation.
@inproceedings{ddcfc78ac28148daa71f178e9884eaaf,
  author    = {Bent, John and Settlemyer, Brad and Bao, Haiyun and Faibish, Sorin and Sauer, Jeremy and Zhang, Jingwang},
  title     = {{BAD-Check}: Bulk Asynchronous Distributed Checkpointing},
  booktitle = {Proceedings of {PDSW} 2015: 10th Parallel Data Storage Workshop - Held in conjunction with {SC} 2015: The International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages     = {19--24},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2015},
  month     = nov,
  day       = {15},
  doi       = {10.1145/2834976.2834981},
  language  = {English},
  abstract  = {Leadership-scale scientific simulations running as tens of thousands of tightly-coupled MPI processes are vulnerable to interruption due to a single process or node failure. Due to the dependence of each state calculation on the successful completion of each of the prior state calculations, checkpoint restart is the most widely-used technique to achieve fault tolerance. To write a consistent view of distributed state as a checkpoint, applications typically synchronize and pause while writing data to persistent media. In this paper we present a transactional protocol that enables asynchronous distributed creation of checkpoint data sets, and describe the conditions under which it is beneficial. With simulations, we demonstrate that scientific applications exhibiting computational variance without frequent synchronization can use our protocol to either reduce run time by up to 27\% or reduce required storage system capability by up to 40\%.},
  note      = {Publisher Copyright: {\textcopyright} 2015 ACM.; 10th Parallel Data Storage Workshop, PDSW 2015 - Held as part of the 27th ACM/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2015 ; Conference date: 15-11-2015 Through 20-11-2015},
}