#file demo.conf
seqspark {
  #project name; used to create the output folder and as the suffix of the default export path
  project = demo

  #number of partitions to use for large datasets
  partitions = 10

  #pipeline stages to run; here annotation, quality control and association
  pipeline = [
    "annotation",
    "qualityControl",
    "association"
  ]

  #input files
  input {

    #path of the input genotype file
    genotype.path = "demo.vcf.bz2"

    #path of the input phenotype file
    phenotype.path = "demo.tsv"
  }

  #output genotype config
  output {
    genotype {

      #write the VCF out once annotation is done
      export = true

      #leave out every sample/genotype, i.e. keep only the variant information
      samples = none
    }
  }

  #annotation object
  annotation {

    #entries that will be added to the INFO field of the input VCF
    addInfo {

      #the key "gnomAD" is added when the variant is found in the database
      gnomAD = "gnomad"

      #take the AF value from gnomad and name it "Total_AF"
      Total_AF = "gnomad.AF"

      #take the AC_NFE and AN_NFE counts from gnomad, compute their ratio and name it "SS_AF"
      SS_AF = "gnomad.AC_NFE/gnomad.AN_NFE"
    }

    #gnomad database config
    db.gnomad {

      #location of the database, either on HDFS or local; seqspark searches HDFS first
      path = "seqspark/gnomad.exome.vcf.bz2"

      #format of the database, VCF here; tsv and csv are also possible
      format = "vcf"
    }
  }
  
  #qualityControl object
  qualityControl {

    #genotype-level QC
    genotypes = [
      "DP >= 8 and GQ >= 20"
    ]

    #variant-level QC
    #"gnomAD" here means the variant must be in gnomAD; that key has to be defined in addInfo
    variants = [
      "missingRate <= 0.1",
      "gnomAD"
    ]

    #perform PCA and calculate titv ratios
    summaries = [
      "pca",
      "titv"
    ]
  }
  
  #association object
  association {

    #trait configures the phenotypes
    trait {

      #traits to run
      list = [
        "bmi"
      ]

      #the trait named "bmi"
      bmi {

        #not binary, therefore quantitative
        binary = false

        #covariates to take from the phenotype file
        covariates = [
          "sex",
          "age",
          "disease"
        ]

        #do not use PC terms in the association analysis
        pc = 0
      }
    }

    #method configures the association methods
    method {

      #methods to run
      list = [
        "snv", "cmc", "brv", "skat", "skato",
        "cmc2", "brv2", "skat2", "skato2"
      ]

      #use SS_AF, computed from the gnomad annotation database, as the MAF source
      snv { maf { source = "SS_AF" } }
      cmc { maf { source = "SS_AF" } }
      brv { maf { source = "SS_AF" } }
      skat { maf { source = "SS_AF" } }
      skato { maf { source = "SS_AF" } }

      #set type and maf for another four analyses,
      #so that the same analyses are run a second time with a different MAF cutoff
      cmc2 {
        type = "cmc"
        maf { source = "SS_AF", cutoff = 0.005 }
      }
      brv2 {
        type = "brv"
        maf { source = "SS_AF", cutoff = 0.005 }
      }
      skat2 {
        type = "skat"
        maf { source = "SS_AF", cutoff = 0.005 }
      }
      skato2 {
        type = "skato"
        maf { source = "SS_AF", cutoff = 0.005 }
      }
    }
  }
}
