% Conference paper in the ICADL 2021 proceedings (the record appears to be an automated export).
% - "DataQuest" is braced in the title so sentence-casing styles keep its inner capital.
% - The publisher name carries an extra brace pair so its literal "and" is not
%   read as a list separator by biblatex/Biber; classic BibTeX is unaffected.
% - Personal names use the unambiguous "Last, First" form throughout.
% - NOTE(review): the note field holds exporter-generated copyright and conference
%   dates, presumably DD-MM-YYYY; confirm before reformatting or moving them.
@inproceedings{0f102c6d010d4fa1b49eac8c3151b592,
  author    = {Kumar, Sandeep and Ghosal, Tirthankar and Ekbal, Asif},
  title     = {{DataQuest}: An Approach to Automatically Extract Dataset Mentions from Scientific Papers},
  booktitle = {Towards Open and Trustworthy Digital Societies - 23rd International Conference on Asia-Pacific Digital Libraries, {ICADL} 2021, Proceedings},
  editor    = {Ke, Hao-Ren and Lee, {Chei Sian} and Sugiyama, Kazunari},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  publisher = {{Springer Science and Business Media Deutschland GmbH}},
  year      = {2021},
  pages     = {43--53},
  doi       = {10.1007/978-3-030-91669-5_4},
  isbn      = {9783030916688},
  language  = {English},
  keywords  = {Dataset discovery, Dataset mention extraction, Deep learning, Publication mining},
  abstract  = {The rapid growth of scientific literature is presenting several challenges for the search and discovery of research artifacts. Datasets are the backbone of scientific experiments. It is crucial to locate the datasets used or generated by previous research as building suitable datasets is costly in terms of time, money, and human labor. Hence automated mechanisms to aid the search and discovery of datasets from scientific publications can aid reproducibility and reusability of these valuable scientific artifacts. Here in this work, utilizing the next sentence prediction capability of language models, we show that a BERT-based entity recognition model with POS aware embedding can be effectively used to address this problem. Our investigation shows that identifying sentences containing dataset mentions in the first place proves critical to the task. Our method outperforms earlier ones and achieves an F1 score of 56.2 in extracting dataset mentions from research papers on a popular corpus of social science publications. We make our codes available at https://github.com/sandeep82945/data_discovery.},
  note      = {Publisher Copyright: {\textcopyright} 2021, Springer Nature Switzerland AG.; 23rd International Conference on Asia-Pacific Digital Libraries, ICADL 2021 ; Conference date: 01-12-2021 Through 03-12-2021},
}