#file qc.conf
seqspark {
  # Project name: used to create the output folder and as the suffix of the
  # default export path.
  project = demo

  # Number of partitions for large datasets.
  partitions = 10

  # Pipeline stages to run: annotation and quality control.
  pipeline = ["annotation", "qualityControl"]

  input {
    # Path of the input genotype file.
    genotype.path = "demo.vcf.bz2"

    # Path of the input phenotype file.
    phenotype.path = "demo.tsv"
  }

  # Output genotype settings.
  output.genotype {
    # Export the VCF after QC.
    export = true

    # Do not include any sample/genotype, i.e. keep only the variant information.
    samples = none
  }

  # Annotation settings.
  annotation {
    # Entries to add to the INFO field of the input VCF.
    addInfo {
      # Adds the key "gnomAD" when the variant is present in the database.
      gnomAD = "gnomad"

      # Takes the AF value from gnomad and names it "Total_AF".
      Total_AF = "gnomad.AF"

      # Takes the AC_NFE and AN_NFE counts from gnomad, computes their ratio,
      # and names it "SS_AF".
      SS_AF = "gnomad.AC_NFE/gnomad.AN_NFE"
    }

    # Database definitions.
    db {
      # gnomad database settings.
      gnomad {
        # Location of the database; can be on HDFS or local (seqspark searches
        # HDFS first).
        path = "seqspark/gnomad.exome.vcf.bz2"

        # Format of the database, here VCF; tsv and csv are also possible.
        format = "vcf"
      }
    }
  }

  # Quality-control settings.
  qualityControl {
    # Genotype-level QC.
    genotypes = ["DP >= 8 and GQ >= 20"]

    # Variant-level QC. Here "gnomAD" means the variant must be in gnomAD;
    # that key must be defined in addInfo.
    variants = ["missingRate <= 0.1", "gnomAD"]

    # Perform PCA and calculate titv ratios.
    summaries = ["pca", "titv"]
  }
}
