% Conference paper presenting Skluma (SSDBM 2017, ACM International
% Conference Proceeding Series). The citation key is kept unchanged so that
% existing citations of this entry still resolve.
% Names are stored as "Last, First" so they parse unambiguously; the coined
% name in the title is brace-protected so styles that recase titles keep it.
@inproceedings{a56dbf8ddabc4572873a69b8574b963a,
  author    = {Beckman, Paul and Skluzacek, Tyler J. and Chard, Kyle and Foster, Ian},
  title     = {{Skluma}: A statistical learning pipeline for taming unkempt data repositories},
  booktitle = {SSDBM 2017},
  series    = {ACM International Conference Proceeding Series},
  publisher = {Association for Computing Machinery},
  year      = {2017},
  month     = jun,
  day       = {27},
  doi       = {10.1145/3085504.3091116},
  language  = {English},
  keywords  = {Data integration, Data wrangling, Metadata extraction, Statistical learning},
  abstract  = {Scientists' capacity to make use of existing data is predicated on their ability to find and understand those data. While significant progress has been made with respect to data publication, and indeed one can point to a number of well organized and highly utilized data repositories, there remain many such repositories in which archived data are poorly described and thus impossible to use. We present Skluma---an automated system designed to process vast amounts of data and extract deeply embedded metadata, latent topics, relationships between data, and contextual metadata derived from related documents. We show that Skluma can be used to organize and index a large climate data collection that totals more than 500GB of data in over a half-million files.},
  note      = {Publisher Copyright: {\textcopyright} 2017 Copyright held by the owner/author(s). 29th International Conference on Scientific and Statistical Database Management, SSDBM 2017; Conference date: 27-06-2017 Through 29-06-2017},
}