% DIBS paper from the ICPE 2018 Companion proceedings.
% Case-sensitive acronyms are brace-protected; author names use "Last, First" form.
@inproceedings{6117d0e67e1b4e1b8e5a9827c582f6af,
  author    = {Cabrera, Anthony M. and Faber, Clayton J. and Cepeda, Kyle and Derber, Robert and Epstein, Cooper and Zheng, Jason and Cytron, Ron K. and Chamberlain, Roger D.},
  title     = {{DIBS}: A Data Integration Benchmark Suite},
  booktitle = {{ICPE} 2018 - Companion of the 2018 {ACM/SPEC} International Conference on Performance Engineering},
  publisher = {Association for Computing Machinery, Inc},
  year      = {2018},
  month     = apr,
  day       = {2},
  pages     = {25--28},
  doi       = {10.1145/3185768.3186307},
  keywords  = {Big data, Data integration, Data wrangling},
  abstract  = {As the generation of data becomes more prolific, the amount of time and resources necessary to perform analyses on these data increases. What is less understood, however, is the data preprocessing steps that must be applied before any meaningful analysis can begin. This problem of taking data in some initial form and transforming it into a desired one is known as data integration. Here, we introduce the Data Integration Benchmarking Suite (DIBS), a suite of applications that are representative of data integration workloads across many disciplines. We apply a comprehensive characterization to these applications to better understand the general behavior of data integration tasks. As a result of our benchmark suite and characterization methods, we offer insight regarding data integration tasks that will guide other researchers designing solutions in this area.},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2018 Copyright held by the owner/author(s).; 9th ACM/SPEC International Conference on Performance Engineering, ICPE 2018 ; Conference date: 09-04-2018 Through 13-04-2018},
}