% Conference paper in the IEEE BIBM 2018 proceedings (pages 718--723).
% The citation key is kept exactly as it was so existing citations still resolve.
% No series field: it used to duplicate booktitle, which made styles that print
% a volume-less series emit the proceedings title twice.
% day and language are not standard BibTeX fields; they are retained as metadata
% (unknown fields are silently ignored by the standard styles).
@inproceedings{69f1ffb82e4b4a20ba87e1c403aabe50,
  author    = {Paul, Alexander J. and Lawrence, Dylan and Song, Myoungkyu and Lim, Seung Hwan and Pan, Chongle and Ahn, Tae Hyuk},
  title     = {{SORA}: Scalable Overlap-graph Reduction Algorithms for Genome Assembly using {Apache Spark} in the Cloud},
  booktitle = {Proceedings - 2018 {IEEE} International Conference on Bioinformatics and Biomedicine, {BIBM} 2018},
  editor    = {Schmidt, Harald and Griol, David and Wang, Haiying and Baumbach, Jan and Zheng, Huiru and Callejas, Zoraida and Hu, Xiaohua and Dickerson, Julie and Zhang, Le},
  pages     = {718--723},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2019},
  month     = jan,
  day       = {21},
  doi       = {10.1109/BIBM.2018.8621546},
  language  = {English},
  keywords  = {apache spark, cloud, genome assembly, graph reduction, overlap-layout-consensus},
  abstract  = {The advent of high-throughput DNA sequencing techniques has permitted very high quality de novo assemblies of genomes, but raise an issue of requiring large amounts of computer memory to resolve the large genome graphs generated by most overlap graph de novo assemblers. To address these limitations, we present a novel algorithmic approach; Scalable Overlap-graph Reduction Algorithms (SORA). SORA adapts string graph reduction algorithms for the genome assembly using a distributed computing platform. To efficiently compute coverage for enormous paths in the graphs, SORA uses Apache Spark which is a cluster-based engine designed on top of Hadoop to handle very large datasets in the cloud. The experimental results show that SORA can process a nearly one billion edge graph in a distributed cloud cluster as well as smaller graphs on a local cluster with a short turnaround time. Moreover, our algorithms scale almost linearly with increasing numbers of virtual instances in the cloud. SORA is freely available for download at https://github.com/BioHPC/SORA/.},
  note      = {Publisher Copyright: {\textcopyright} 2018 IEEE. 2018 IEEE International Conference on Bioinformatics and Biomedicine, BIBM 2018; Conference date: 03-12-2018 through 06-12-2018},
}