基因数据处理81之callVariant实现类callVariantByAvocado

来源:互联网 发布:网络管理员好考吗 知乎 编辑:程序博客网 时间:2024/05/17 03:41

1.代码:

package org.gcdss.cli.callVariant

import java.text.SimpleDateFormat
import java.util._

import org.apache.spark.{SparkConf, SparkContext}
import org.bdgenomics.adam.rdd.ADAMContext._
import org.gcdss.cli.Gcdss

//import org.bdgenomics.avocado.AvocadoFunSuite

/**
 * Driver that runs the Gcdss variant-calling step on a set of aligned reads
 * and then reads the produced output back for inspection.
 *
 * @param fqFile     path of the alignment file handed to `sc.loadAlignments`
 * @param faFile     path of the reference FASTA handed to `sc.loadFasta`
 * @param output     path the Gcdss step is asked to write to; it is re-read afterwards
 * @param configFile path of the configuration file forwarded to Gcdss
 */
class callVariantByAvocado(fqFile: String, faFile: String, output: String, configFile: String) {

  /**
   * Loads both inputs, prints their sizes, runs Gcdss and prints what was written.
   *
   * @param sc the Spark context to run on; it is not stopped here
   */
  def run(sc: SparkContext): Unit = {
    println("start run:")
    val fqLoad = sc.loadAlignments(fqFile)
    // NOTE(review): 10000 is presumably the FASTA fragment length - confirm against ADAM.
    val faLoad = sc.loadFasta(faFile, 10000)

    println(s"fqFile:$fqFile")
    println(s"faFile:$faFile")
    println(s"configFile:$configFile")
    println(s"output:$output")
    println(s"fqLoad.count:${fqLoad.count}")
    println(s"faLoad.count:${faLoad.count}")

    Gcdss(Array(fqFile, faFile, output, configFile)).run(sc)

    val read1 = sc.loadAlignments(output)
    println("read parquet:" + read1.count())
    // RDD.foreach runs on the executors, so on a cluster these lines end up in the
    // executor stdout rather than on the driver console.
    read1.foreach(println)
    println("*************end*************")
  }
}

/** Command-line entry points for [[callVariantByAvocado]]. */
object callVariantByAvocado {

  /** Number of positional arguments both entry points need. */
  private val RequiredArgCount = 4

  /** Usage text; the argument order matches the class constructor. */
  private val UsageMessage =
    "at least four arguments required, e.g. fqFile faFile outputPath configFile"

  /**
   * Cluster entry point: the master is taken from the submit environment.
   *
   * @param args fqFile, faFile, output path prefix, configFile
   */
  def main(args: Array[String]): Unit = {
    checkArgs(args)
    println("start main:")
    execute(args, baseConf)
  }

  /**
   * Same as [[main]] but forces the `local[4]` master, for running inside an IDE or a test.
   *
   * @param args fqFile, faFile, output path prefix, configFile
   */
  def runLocal(args: Array[String]): Unit = {
    checkArgs(args)
    execute(args, baseConf.setMaster("local[4]"))
  }

  /** Prints the usage text and terminates the JVM with status 1 when arguments are missing. */
  private def checkArgs(args: Array[String]): Unit = {
    if (args.length < RequiredArgCount) {
      System.err.println(UsageMessage)
      System.exit(1)
    }
  }

  /** Spark configuration named after this object, without the trailing `$` of the module class. */
  private def baseConf: SparkConf =
    new SparkConf().setAppName(this.getClass().getSimpleName().filter(!_.equals('$')))

  /**
   * Creates the context, runs the job with a timestamp appended to the output path,
   * and always stops the context, even when the job throws.
   */
  private def execute(args: Array[String], conf: SparkConf): Unit = {
    val sc = new SparkContext(conf)
    try {
      // The timestamp suffix gives every run its own output path.
      val iString = new SimpleDateFormat("yyyyMMddHHmmssSSS").format(new Date())
      new callVariantByAvocado(args(0), args(1), args(2) + iString, args(3)).run(sc)
    } finally {
      sc.stop()
    }
  }
}

2.测试:

package org.gcdss.cli.callVariant

import org.scalatest.FunSuite

/**
 * Smoke test that drives callVariantByAvocado.runLocal on the "artificial" sample.
 *
 * Created by xubo on 2016/6/9.
 */
class callVariantByAvocadoLocalSuite extends FunSuite {

  test("artificial fa By runLocal") {
    println("start callVariantByAvocadoLocalSuite:")

    // Argument order expected by runLocal: reads, reference, output prefix, config file.
    // runLocal appends its own timestamp to the output prefix.
    val runArgs = Array(
      "hdfs://219.219.220.149:9000/xubo/callVariant/avocado/artificial/artificial.realigned.sam",
      "hdfs://219.219.220.149:9000/xubo/callVariant/avocado/artificial/artificial.fa",
      "hdfs://219.219.220.149:9000/xubo/callVariant/avocado/output/artificialT",
      "D:/all/idea/gcdss-master/file/avocado-sample-configs/basic.properties"
    )

    callVariantByAvocado.runLocal(runArgs)
    println("end")
  }
}

3.运行结果:

start run:2016-06-09 13:18:07 WARN  :139 - Your hostname, xubo-PC resolves to a loopback/non-reachable address: fe80:0:0:0:482:722f:5976:ce1f%20, but we couldn't find any external IP address!fqFile:hdfs://219.219.220.149:9000/xubo/callVariant/avocado/artificial/artificial.realigned.samfaFile:hdfs://219.219.220.149:9000/xubo/callVariant/avocado/artificial/artificial.faconfigFile:D:/all/idea/gcdss-master/file/avocado-sample-configs/basic.propertiesoutput:hdfs://219.219.220.149:9000/xubo/callVariant/avocado/output/artificialT20160609131806104fqLoad.count:10faLoad.count:1Loading reads in from hdfs://219.219.220.149:9000/xubo/callVariant/avocado/artificial/artificial.realigned.samSLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".SLF4J: Defaulting to no-operation (NOP) logger implementationSLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.read parquet:11{"variant": {"variantErrorProbability": null, "contig": {"contigName": "artificial", "contigLength": 1120, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "start": 33, "end": 44, "referenceAllele": "AGGGGGGGGGG", "alternateAllele": "A", "svAllele": null, "isSomatic": false}, "variantCallingAnnotations": {"variantIsPassing": null, "variantFilters": [], "downsampled": null, "baseQRankSum": null, "fisherStrandBiasPValue": "Infinity", "rmsMapQ": 94.12757, "mapq0Reads": null, "mqRankSum": null, "readPositionRankSum": null, "genotypePriors": [], "genotypePosteriors": [], "vqslod": null, "culprit": null, "attributes": {}}, "sampleId": "sequencing_center", "sampleDescription": null, "processingDescription": null, "alleles": ["Ref", "Alt"], "expectedAlleleDosage": null, "referenceReadDepth": 0, "alternateReadDepth": 5, "readDepth": 5, "minReadDepth": null, "genotypeQuality": 18, "genotypeLikelihoods": [-1.1486835E-6, -3.465736, -77.136604], "nonReferenceLikelihoods": [-1.1486835E-6, -3.465736, -77.136604], "strandBiasComponents": [], 
"splitFromMultiAllelic": false, "isPhased": false, "phaseSetId": null, "phaseQuality": null}{"variant": {"variantErrorProbability": null, "contig": {"contigName": "artificial", "contigLength": 1120, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "start": 54, "end": 55, "referenceAllele": "G", "alternateAllele": "A", "svAllele": null, "isSomatic": false}, "variantCallingAnnotations": {"variantIsPassing": null, "variantFilters": [], "downsampled": null, "baseQRankSum": null, "fisherStrandBiasPValue": "Infinity", "rmsMapQ": 94.12757, "mapq0Reads": null, "mqRankSum": -1.7320508, "readPositionRankSum": null, "genotypePriors": [], "genotypePosteriors": [], "vqslod": null, "culprit": null, "attributes": {}}, "sampleId": "sequencing_center", "sampleDescription": null, "processingDescription": null, "alleles": ["Ref", "Alt"], "expectedAlleleDosage": null, "referenceReadDepth": 2, "alternateReadDepth": 3, "readDepth": 5, "minReadDepth": null, "genotypeQuality": 2147483647, "genotypeLikelihoods": [-32.23619, -3.465736, -44.90041], "nonReferenceLikelihoods": [-32.23619, -3.465736, -44.90041], "strandBiasComponents": [], "splitFromMultiAllelic": false, "isPhased": false, "phaseSetId": null, "phaseQuality": null}{"variant": {"variantErrorProbability": null, "contig": {"contigName": "artificial", "contigLength": 1120, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "start": 55, "end": 56, "referenceAllele": "G", "alternateAllele": "A", "svAllele": null, "isSomatic": false}, "variantCallingAnnotations": {"variantIsPassing": null, "variantFilters": [], "downsampled": null, "baseQRankSum": null, "fisherStrandBiasPValue": "Infinity", "rmsMapQ": 94.12757, "mapq0Reads": null, "mqRankSum": -1.7320508, "readPositionRankSum": null, "genotypePriors": [], "genotypePosteriors": [], "vqslod": null, "culprit": null, "attributes": {}}, "sampleId": "sequencing_center", "sampleDescription": 
null, "processingDescription": null, "alleles": ["Ref", "Alt"], "expectedAlleleDosage": null, "referenceReadDepth": 2, "alternateReadDepth": 3, "readDepth": 5, "minReadDepth": null, "genotypeQuality": 2147483647, "genotypeLikelihoods": [-32.23619, -3.465736, -44.90041], "nonReferenceLikelihoods": [-32.23619, -3.465736, -44.90041], "strandBiasComponents": [], "splitFromMultiAllelic": false, "isPhased": false, "phaseSetId": null, "phaseQuality": null}{"variant": {"variantErrorProbability": null, "contig": {"contigName": "artificial", "contigLength": 1120, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "start": 56, "end": 57, "referenceAllele": "G", "alternateAllele": "A", "svAllele": null, "isSomatic": false}, "variantCallingAnnotations": {"variantIsPassing": null, "variantFilters": [], "downsampled": null, "baseQRankSum": null, "fisherStrandBiasPValue": "Infinity", "rmsMapQ": 94.12757, "mapq0Reads": null, "mqRankSum": -1.7320508, "readPositionRankSum": null, "genotypePriors": [], "genotypePosteriors": [], "vqslod": null, "culprit": null, "attributes": {}}, "sampleId": "sequencing_center", "sampleDescription": null, "processingDescription": null, "alleles": ["Ref", "Alt"], "expectedAlleleDosage": null, "referenceReadDepth": 2, "alternateReadDepth": 3, "readDepth": 5, "minReadDepth": null, "genotypeQuality": 2147483647, "genotypeLikelihoods": [-32.23619, -3.465736, -44.90041], "nonReferenceLikelihoods": [-32.23619, -3.465736, -44.90041], "strandBiasComponents": [], "splitFromMultiAllelic": false, "isPhased": false, "phaseSetId": null, "phaseQuality": null}{"variant": {"variantErrorProbability": null, "contig": {"contigName": "artificial", "contigLength": 1120, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "start": 57, "end": 58, "referenceAllele": "G", "alternateAllele": "A", "svAllele": null, "isSomatic": false}, "variantCallingAnnotations": 
{"variantIsPassing": null, "variantFilters": [], "downsampled": null, "baseQRankSum": null, "fisherStrandBiasPValue": "Infinity", "rmsMapQ": 94.12757, "mapq0Reads": null, "mqRankSum": -1.7320508, "readPositionRankSum": null, "genotypePriors": [], "genotypePosteriors": [], "vqslod": null, "culprit": null, "attributes": {}}, "sampleId": "sequencing_center", "sampleDescription": null, "processingDescription": null, "alleles": ["Ref", "Alt"], "expectedAlleleDosage": null, "referenceReadDepth": 2, "alternateReadDepth": 3, "readDepth": 5, "minReadDepth": null, "genotypeQuality": 2147483647, "genotypeLikelihoods": [-32.23619, -3.465736, -44.90041], "nonReferenceLikelihoods": [-32.23619, -3.465736, -44.90041], "strandBiasComponents": [], "splitFromMultiAllelic": false, "isPhased": false, "phaseSetId": null, "phaseQuality": null}{"variant": {"variantErrorProbability": null, "contig": {"contigName": "artificial", "contigLength": 1120, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "start": 58, "end": 59, "referenceAllele": "G", "alternateAllele": "A", "svAllele": null, "isSomatic": false}, "variantCallingAnnotations": {"variantIsPassing": null, "variantFilters": [], "downsampled": null, "baseQRankSum": null, "fisherStrandBiasPValue": "Infinity", "rmsMapQ": 94.12757, "mapq0Reads": null, "mqRankSum": -1.7320508, "readPositionRankSum": null, "genotypePriors": [], "genotypePosteriors": [], "vqslod": null, "culprit": null, "attributes": {}}, "sampleId": "sequencing_center", "sampleDescription": null, "processingDescription": null, "alleles": ["Ref", "Alt"], "expectedAlleleDosage": null, "referenceReadDepth": 2, "alternateReadDepth": 3, "readDepth": 5, "minReadDepth": null, "genotypeQuality": 2147483647, "genotypeLikelihoods": [-32.23619, -3.465736, -44.90041], "nonReferenceLikelihoods": [-32.23619, -3.465736, -44.90041], "strandBiasComponents": [], "splitFromMultiAllelic": false, "isPhased": false, "phaseSetId": null, 
"phaseQuality": null}{"variant": {"variantErrorProbability": null, "contig": {"contigName": "artificial", "contigLength": 1120, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "start": 59, "end": 60, "referenceAllele": "G", "alternateAllele": "A", "svAllele": null, "isSomatic": false}, "variantCallingAnnotations": {"variantIsPassing": null, "variantFilters": [], "downsampled": null, "baseQRankSum": null, "fisherStrandBiasPValue": "Infinity", "rmsMapQ": 94.12757, "mapq0Reads": null, "mqRankSum": -1.7320508, "readPositionRankSum": null, "genotypePriors": [], "genotypePosteriors": [], "vqslod": null, "culprit": null, "attributes": {}}, "sampleId": "sequencing_center", "sampleDescription": null, "processingDescription": null, "alleles": ["Ref", "Alt"], "expectedAlleleDosage": null, "referenceReadDepth": 2, "alternateReadDepth": 3, "readDepth": 5, "minReadDepth": null, "genotypeQuality": 2147483647, "genotypeLikelihoods": [-32.23619, -3.465736, -44.90041], "nonReferenceLikelihoods": [-32.23619, -3.465736, -44.90041], "strandBiasComponents": [], "splitFromMultiAllelic": false, "isPhased": false, "phaseSetId": null, "phaseQuality": null}{"variant": {"variantErrorProbability": null, "contig": {"contigName": "artificial", "contigLength": 1120, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "start": 60, "end": 61, "referenceAllele": "G", "alternateAllele": "A", "svAllele": null, "isSomatic": false}, "variantCallingAnnotations": {"variantIsPassing": null, "variantFilters": [], "downsampled": null, "baseQRankSum": null, "fisherStrandBiasPValue": "Infinity", "rmsMapQ": 94.12757, "mapq0Reads": null, "mqRankSum": -1.7320508, "readPositionRankSum": null, "genotypePriors": [], "genotypePosteriors": [], "vqslod": null, "culprit": null, "attributes": {}}, "sampleId": "sequencing_center", "sampleDescription": null, "processingDescription": null, "alleles": ["Ref", "Alt"], 
"expectedAlleleDosage": null, "referenceReadDepth": 2, "alternateReadDepth": 3, "readDepth": 5, "minReadDepth": null, "genotypeQuality": 2147483647, "genotypeLikelihoods": [-32.23619, -3.465736, -44.90041], "nonReferenceLikelihoods": [-32.23619, -3.465736, -44.90041], "strandBiasComponents": [], "splitFromMultiAllelic": false, "isPhased": false, "phaseSetId": null, "phaseQuality": null}{"variant": {"variantErrorProbability": null, "contig": {"contigName": "artificial", "contigLength": 1120, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "start": 61, "end": 62, "referenceAllele": "G", "alternateAllele": "A", "svAllele": null, "isSomatic": false}, "variantCallingAnnotations": {"variantIsPassing": null, "variantFilters": [], "downsampled": null, "baseQRankSum": null, "fisherStrandBiasPValue": "Infinity", "rmsMapQ": 94.12757, "mapq0Reads": null, "mqRankSum": -1.7320508, "readPositionRankSum": null, "genotypePriors": [], "genotypePosteriors": [], "vqslod": null, "culprit": null, "attributes": {}}, "sampleId": "sequencing_center", "sampleDescription": null, "processingDescription": null, "alleles": ["Ref", "Alt"], "expectedAlleleDosage": null, "referenceReadDepth": 2, "alternateReadDepth": 3, "readDepth": 5, "minReadDepth": null, "genotypeQuality": 2147483647, "genotypeLikelihoods": [-32.23619, -3.465736, -44.90041], "nonReferenceLikelihoods": [-32.23619, -3.465736, -44.90041], "strandBiasComponents": [], "splitFromMultiAllelic": false, "isPhased": false, "phaseSetId": null, "phaseQuality": null}{"variant": {"variantErrorProbability": null, "contig": {"contigName": "artificial", "contigLength": 1120, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "start": 62, "end": 63, "referenceAllele": "G", "alternateAllele": "A", "svAllele": null, "isSomatic": false}, "variantCallingAnnotations": {"variantIsPassing": null, "variantFilters": [], "downsampled": null, 
"baseQRankSum": null, "fisherStrandBiasPValue": "Infinity", "rmsMapQ": 94.12757, "mapq0Reads": null, "mqRankSum": -1.7320508, "readPositionRankSum": null, "genotypePriors": [], "genotypePosteriors": [], "vqslod": null, "culprit": null, "attributes": {}}, "sampleId": "sequencing_center", "sampleDescription": null, "processingDescription": null, "alleles": ["Ref", "Alt"], "expectedAlleleDosage": null, "referenceReadDepth": 2, "alternateReadDepth": 3, "readDepth": 5, "minReadDepth": null, "genotypeQuality": 2147483647, "genotypeLikelihoods": [-32.23619, -3.465736, -44.90041], "nonReferenceLikelihoods": [-32.23619, -3.465736, -44.90041], "strandBiasComponents": [], "splitFromMultiAllelic": false, "isPhased": false, "phaseSetId": null, "phaseQuality": null}{"variant": {"variantErrorProbability": null, "contig": {"contigName": "artificial", "contigLength": 1120, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "start": 63, "end": 64, "referenceAllele": "G", "alternateAllele": "A", "svAllele": null, "isSomatic": false}, "variantCallingAnnotations": {"variantIsPassing": null, "variantFilters": [], "downsampled": null, "baseQRankSum": null, "fisherStrandBiasPValue": "Infinity", "rmsMapQ": 94.12757, "mapq0Reads": null, "mqRankSum": -1.7320508, "readPositionRankSum": null, "genotypePriors": [], "genotypePosteriors": [], "vqslod": null, "culprit": null, "attributes": {}}, "sampleId": "sequencing_center", "sampleDescription": null, "processingDescription": null, "alleles": ["Ref", "Alt"], "expectedAlleleDosage": null, "referenceReadDepth": 2, "alternateReadDepth": 3, "readDepth": 5, "minReadDepth": null, "genotypeQuality": 2147483647, "genotypeLikelihoods": [-32.23619, -3.465736, -44.90041], "nonReferenceLikelihoods": [-32.23619, -3.465736, -44.90041], "strandBiasComponents": [], "splitFromMultiAllelic": false, "isPhased": false, "phaseSetId": null, "phaseQuality": null}*************end*************end

4.脚本:

hadoop@Master:~/xubo/project/callVariant$ cat load2.sh
#!/usr/bin/env bash
spark-submit \
  --class org.gcdss.cli.callVariant.callVariantByAvocado \
  --master spark://219.219.220.149:7077 \
  --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
  --conf spark.kryo.registrator=org.bdgenomics.adam.serialization.ADAMKryoRegistrator \
  --jars /home/hadoop/cloud/adam/lib/adam-apis_2.10-0.18.3-SNAPSHOT.jar,/home/hadoop/cloud/adam/lib/adam-cli_2.10-0.18.3-SNAPSHOT.jar,/home/hadoop/cloud/adam/lib/adam-core_2.10-0.18.3-SNAPSHOT.jar,/home/hadoop/cloud/adam/xubo/data/GRCH38Sub/cs-bwamem/BWAMEMSparkAll/gcdss-cli-0.0.3-SNAPSHOT.jar \
  --executor-memory 4096M \
  --total-executor-cores 20 BWAMEMSparkAll.jar \
  /xubo/callVariant/avocado/artificial/artificial.realigned.sam /xubo/callVariant/avocado/artificial/artificial.fa /xubo/callVariant/avocado/output/artificialT /home/hadoop/xubo/data/testTools/basic.properties

运行记录:

hadoop@Master:~/xubo/project/callVariant$ ./load2.sh
start main:
start run:
fqFile:/xubo/callVariant/avocado/artificial/artificial.realigned.sam
faFile:/xubo/callVariant/avocado/artificial/artificial.fa
configFile:/home/hadoop/xubo/data/testTools/basic.properties
output:/xubo/callVariant/avocado/output/artificialT20160609132038080
fqLoad.count:10
faLoad.count:1
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
read parquet:11
*************end*************

参考

【1】https://github.com/xubo245/AdamLearning
【2】https://github.com/bigdatagenomics/adam/
【3】https://github.com/xubo245/SparkLearning
【4】http://spark.apache.org
【5】http://stackoverflow.com/questions/28166667/how-to-pass-d-parameter-or-environment-variable-to-spark-job
【6】http://stackoverflow.com/questions/28840438/how-to-override-sparks-log4j-properties-per-driver

研究成果:

【1】 [BIBM] Bo Xu, Changlong Li, Hang Zhuang, Jiali Wang, Qingfeng Wang, Chao Wang, and Xuehai Zhou, "Distributed Gene Clinical Decision Support System Based on Cloud Computing", in IEEE International Conference on Bioinformatics and Biomedicine. (BIBM 2017, CCF B)【2】 [IEEE CLOUD] Bo Xu, Changlong Li, Hang Zhuang, Jiali Wang, Qingfeng Wang, Xuehai Zhou. Efficient Distributed Smith-Waterman Algorithm Based on Apache Spark (CLOUD 2017, CCF-C).【3】 [CCGrid] Bo Xu, Changlong Li, Hang Zhuang, Jiali Wang, Qingfeng Wang, Jinhong Zhou, Xuehai Zhou. DSA: Scalable Distributed Sequence Alignment System Using SIMD Instructions. (CCGrid 2017, CCF-C).【4】more: https://github.com/xubo245/Publications

Help

If you have any questions or suggestions, please open an issue in this project or send me an e-mail: xubo245@mail.ustc.edu.cn
Wechat: xu601450868
QQ: 601450868
阅读全文
'); })();
0 0
原创粉丝点击
热门IT博客
热门问题 老师的惩罚 人脸识别 我在镇武司摸鱼那些年 重生之率土为王 我在大康的咸鱼生活 盘龙之生命进化 天生仙种 凡人之先天五行 春回大明朝 姑娘不必设防,我是瞎子 是金子总会发光 总会想起那张照片作文600字 但是天总会黑人总要离别 中华慈善总会 时间总会说出再见 换季嘴里总会烂怎么回事 郑州慈善总会 男主总会变成恐怖boss 风雨过后总会有彩虹 双性总会变成rbq 付出总会有收获经典语录 总会想起那张小学毕业照 总会想起那张照片作文 中华体育总会 总体安全观 总体 西咸新区总体规划 城市总体规划 统筹推进总体布局 民俗学总体分为哪几部分 总体布局和战略布局 领导班子总体评价 把生态文明建设纳入总体布局 生产总值 总值 国内生产总值 人均生产总值 国民生产总值排名 国民生产总值名词解释 国民生产总值和国内生产总值 劳动生产总值 中国生产总值 地区生产总值 国内生产总值平减指数 国内生产总值指数 2018国内生产总值 中国国民生产总值 2019国内生产总值 人均国内生产总值 中国国内生产总值 实际国内生产总值