% Workshop paper from the Euro-Par 2015 workshops, published in Springer's LNCS series.
% Cleaned up from an automatic portal export:
%   - the copyright line and conference date used to live in a printed note field;
%     they are now kept in copyright / eventdate, which classic styles ignore
%   - names are stored uniformly in "Last, First" form
%   - NOTE(review): the LNCS volume number is not recorded here; confirm it against
%     the publisher record and add a volume field
@inproceedings{83894bb864a24ee9b255f2b324b004b5,
  author    = {Widener, Patrick M. and Ferreira, Kurt B. and Levy, Scott and Fabian, Nathan},
  title     = {Canaries in a coal mine: Using application-level checkpoints to detect memory failures},
  booktitle = {{Euro-Par} 2015: Parallel Processing Workshops},
  editor    = {Hunold, Sascha and Weidendorfer, Josef and Gimenez, Domingo and Ricci, Laura and Lankes, Stefan and Costan, Alexandru and Varbanescu, Ana Lucia and Scott, Stephen L. and Requena, Mar{\'\i}a Engracia G{\'o}mez and Scarano, Vittorio and Iosup, Alexandru and Alexander, Michael},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer International Publishing},
  year      = 2015,
  pages     = {669--681},
  doi       = {10.1007/978-3-319-27308-2_54},
  isbn      = {978-3-319-27307-5},
  language  = {English},
  eventdate = {2015-08-24/2015-08-25},
  copyright = {{\textcopyright} Springer International Publishing Switzerland 2015},
  abstract  = {Memory failures in future extreme scale applications are a significant concern in the high-performance computing community and have attracted much research attention. We contend in this paper that using application checkpoint data to detect memory failures has potential benefits and is preferable to examining application memory. To support this contention, we describe the application of machine learning techniques to evaluate the veracity of checkpoint data. Our preliminary results indicate that supervised decision tree machine learning approaches can effectively detect corruption in restart files, suggesting that future extreme-scale applications and systems may benefit from incorporating such approaches in order to cope with memory failures.},
}