% Workshop paper (DSN-W 2010) on fault prediction from Blue Gene/P RAS logs.
% Braces inside the title protect the proper noun and the acronym from
% style-driven sentence casing; authors are stored in "Last, First" form.
@inproceedings{44a8652487ac4dcb84c9f7ac36afb43a,
  author    = {Thompson, Joshua and Dreisigmeyer, David W. and Jones, Terry and Kirby, Michael and Ladd, Joshua},
  title     = {Accurate fault prediction of {Blue Gene/P} {RAS} logs via geometric reduction},
  booktitle = {2010 International Conference on Dependable Systems and Networks Workshops, DSN-W 2010},
  series    = {Proceedings of the International Conference on Dependable Systems and Networks},
  pages     = {8--14},
  year      = {2010},
  isbn      = {9781424477302},
  doi       = {10.1109/DSNW.2010.5542626},
  language  = {English},
  keywords  = {Fault prediction, High performance computing, MSET, NMF, Resiliency},
  abstract  = {This investigation presents two distinct and novel approaches for the prediction of system failures occurring in Oak Ridge National Laboratory's Blue Gene/P supercomputer. Each technique uses raw numeric and textual subsets of large data logs of physical system information such as fan speeds and CPU temperatures. This data is used to develop models of the system capable of sensing anomalies, or deviations from nominal behavior. Each algorithm predicted event log reported anomalies in advance of their occurrence and one algorithm did so without false positives. Both algorithms predicted an anomaly that did not appear in the event log. It was later learned that the fault missing from the log but predicted by both algorithms was confirmed to have occurred by the system administrator.},
  note      = {2010 International Conference on Dependable Systems and Networks Workshops, DSN-W 2010 ; Conference date: 28-06-2010 Through 01-07-2010},
}