% Conference paper, IEEE Big Data 2019 (citation key kept as exported so existing \cite commands still resolve).
% Export boilerplate (copyright line, conference dates) is kept in `annote`, which standard styles do not print.
% The proceedings title is given once, in `booktitle`.
@inproceedings{600ec041e5654f79a69d883de9b56798,
  author    = {Sattar, Naw Safrin and Arifuzzaman, Shaikh and Zibran, Minhaz F. and Sakib, Md Mohiuddin},
  title     = {Detecting Web Spam in Webgraphs with Predictive Model Analysis},
  booktitle = {Proceedings - 2019 IEEE International Conference on Big Data, Big Data 2019},
  editor    = {Baru, Chaitanya and Huan, Jun and Khan, Latifur and Hu, Xiaohua Tony and Ak, Ronay and Tian, Yuanyuan and Barga, Roger and Zaniolo, Carlo and Lee, Kisung and Ye, Yanfang Fanny},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2019},
  month     = dec,
  pages     = {4299--4308},
  doi       = {10.1109/BigData47090.2019.9006282},
  language  = {English},
  keywords  = {graph mining, machine learning, security, web spam, webgraphs},
  abstract  = {Web spam is a serious threat for both end-users and search engines (w.r.t., query cost). Webgraphs can be exploited in detecting spam. In the past, several graph mining techniques were applied to measure metrics for pages and hyperlinks. In this paper, we justify the importance of webgraph to distinguish spam websites from non-spam ones based on several graph metrics computed for a labelled dataset (WEBSPAM-UK2007) and justify our model by testing on uk-2014 dataset, the most recently available dataset on the same (uk) domain. WEBSPAM-UK2007 dataset includes 0.1 million different hosts and four kinds of feature sets: Obvious, Link, Transformed Link and Content. We use five prominent machine learning (ML) techniques (i.e., Support Vector Machine (SVM), K-Nearest Neighbor (KNN), Logistic Regression, Na{\"i}ve Bayes and Random Forest) to build a ML-based classifier. To evaluate the performance of our classifier, we compute accuracy and F-1 score and perform 10-fold cross validation. We also compare graph based features with content based textual features and find that graph properties are similar or better than text properties. We achieve above 99\% training accuracy for most of our machine learning models. We test our model with uk-2014 dataset with 4.7 million hosts for the graph-based feature sets and achieve accuracy in between 90--94\% for most of the models. To the best of our knowledge, prior works on web spam detection with WEBSPAM-UK2007 dataset did not use different test dataset for their models. Our model classifier is capable of detecting web spam for any input webgraph based on its graph metrics features.},
  annote    = {Publisher Copyright: {\textcopyright} 2019 IEEE. Conference: 2019 IEEE International Conference on Big Data, Big Data 2019. Conference dates: 2019-12-09 to 2019-12-12.},
}