Changes - Statistical Genetics Courses

AdvGeneMap2018Commands

5,780 bytes added, 19:46, 23 January 2018

/* GxG Interaction */

===Functional Annotation===

table_annovar.pl

table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Gene.vcf -remove -nastring . -protocol refGene -operation g -vcfinput

cat APOC3_Gene.vcf.hg19_multianno.txt

table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Gene.vcf -remove -nastring . -protocol refGene,knownGene,ensGene -operation g,g,g -arg '-splicing 12 -exonicsplicing','-splicing 12 -exonicsplicing','-splicing 12 -exonicsplicing' -vcfinput

awk -F'\t' '{print $1,$2,$6,$7,$8,$9,$10}' APOC3_Gene.vcf.hg19_multianno.txt

table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Region.vcf -remove -nastring . -protocol phastConsElements46way -operation r -vcfinput

table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Region.vcf -remove -nastring . -protocol gwasCatalog -operation r -vcfinput

table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Filter.vcf -remove -nastring . -protocol gnomad_genome,gnomad_exome,popfreq_max_20150413,gme,avsnp150,dbnsfp33a,dbscsnv11,cadd13gt20,clinvar_20170905,gwava -operation f,f,f,f,f,f,f,f,f,f -vcfinput

awk -F'\t' '{print $1,$2,$103,$104}' APOC3_Filter.vcf.hg19_multianno.txt

awk -F'\t' '{print $1,$2,$6,$14}' APOC3_Filter.vcf.hg19_multianno.txt

awk -F'\t' '{print $1,$2,$15,$16,$17,$18,$19,$20,$21,$22}' APOC3_Filter.vcf.hg19_multianno.txt

awk -F'\t' '{print $1,$2,$36,$86,$70}' APOC3_Filter.vcf.hg19_multianno.txt

awk -F'\t' '{print $1,$2,$99,$100}' APOC3_Filter.vcf.hg19_multianno.txt

table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_ANN.vcf -remove -nastring . -protocol refGene,knownGene,ensGene,wgRna,targetScanS,phastConsElements46way,tfbsConsSites,gwasCatalog,gnomad_genome,gnomad_exome,popfreq_max_20150413,gme,avsnp150,dbnsfp33a,dbscsnv11,cadd13gt20,clinvar_20170905,gwava -operation g,g,g,r,r,r,r,r,f,f,f,f,f,f,f,f,f,f -arg '-splicing 12 -exonicsplicing','-splicing 12 -exonicsplicing','-splicing 12 -exonicsplicing',,,,,,,,,,,,,,, -vcfinput

===GenABEL===

# Load files
library(GenABEL)

# Convert the transposed PLINK files (tped/tfam) to GenABEL's raw genotype
# file, then load it together with the phenotype file into `g.dat`.
convert.snp.tped(
  tped = "gwa_gabel_qtl.tped",
  tfam = "gwa_gabel_qtl.tfam",
  out = "gwa_gabel_qtl.raw",
  strand = "u"
)
g.dat <- load.gwaa.data(
  phen = "gwa_gabel_qtl.praw",
  gen = "gwa_gabel_qtl.raw",
  force = TRUE
)

# Inspect the structure of the loaded object.
slotNames(g.dat)
slotNames(g.dat@gtdata)

# Trait
summary(g.dat@phdata$disease)

# Density-scaled histogram of the trait with a rug of individual values.
hist(
  g.dat@phdata$disease,
  main = "Quantitative Phenotype data summary",
  xlab = "Systolic pressure measure",
  freq = FALSE,
  breaks = 20,
  col = "gray"
)
rug(g.dat@phdata$disease)

###
# GLM test
test.snp <- scan.glm("disease ~ CRSNP", family = gaussian(), data = g.dat)
names(test.snp)

# Genome-wide significance threshold used throughout this section.
alpha <- 5e-8
test.snp$P1df[test.snp$P1df < alpha]

# Score test
test.qt <- qtscore(disease, data = g.dat, trait = "gaussian")
slotNames(test.qt)
names(test.qt@results)

# QQ plot: sorted observed 1-df p-values against i / (n + 1).
obs <- sort(results(test.qt)$P1df)
ept <- seq_along(obs) / (length(obs) + 1)
plot(
  -log10(ept), -log10(obs),
  main = "GWAS QQ plot, qtl",
  xlab = "Expected -log10(pvalue)",
  ylab = "Observed -log10(pvalue)"
)
abline(0, 1, col = "red")
abline(h = 8, lty = 2)

# Manhattan plot
plot(test.qt, col = "black")

# Adding confounders
test.qt.sex <- qtscore(disease ~ sex, data = g.dat, trait = "gaussian")

# Select SNPs by the sex-adjusted p-values. The earlier version indexed the
# adjusted result names with the unadjusted `test.qt` p-values.
rownames(results(test.qt.sex))[results(test.qt.sex)$P1df < alpha]
summary(lm(disease ~ sex, data = g.dat))

###
# MDS
###
gkin <- ibs(g.dat, weight = "freq")
gkin[1:10, 1:10]

# Classical MDS on the distance 0.5 - kinship, keeping 10 dimensions and the
# eigenvalues (used by the scree plots below).
cps.full <- cmdscale(as.dist(.5 - gkin), eig = TRUE, k = 10)
names(cps.full)
cps <- cps.full$points

# First two MDS coordinates, with the plotting symbol taken from `popn`.
plot(cps[, 1], cps[, 2], pch = g.dat@phdata$popn)
legend(-0.16, 0.06, c("TSI", "MEX", "CEU"), pch = c(1, 2, 3))

###
# Corrected test
gpc.dat <- g.dat
gpc.dat@phdata <- cbind(g.dat@phdata, cps)

# NOTE(review): the models below reference columns C1..C5; `cps` is bound
# here without explicit column names -- confirm phdata really exposes
# C1..C5 after the cbind().
test.pc.a <- scan.glm(
  "disease ~ CRSNP + C1 + C2 + C3 + C4 + C5",
  family = gaussian(),
  data = gpc.dat
)
test.pc.a$snpnames[test.pc.a$P1df < alpha]
test.pc.a$P1df[test.pc.a$P1df < alpha]

test.pc.b <- qtscore(
  disease ~ C1 + C2 + C3 + C4 + C5,
  data = gpc.dat,
  trait = "gaussian"
)
test.pc.b@lambda

# scree plot
plot(
  cps.full$eig[1:10] / sum(cps.full$eig),
  axes = FALSE,
  type = "b",
  xlab = "Components",
  ylim = c(0, 0.05),
  ylab = "Proportion of Variations",
  main = "MDS analysis scree plot"
)
axis(1, 1:10)
axis(2)

# cumulative plot
plot(
  cumsum(cps.full$eig[1:10]) / sum(cps.full$eig),
  axes = FALSE,
  type = "b",
  ylim = c(0, 0.2),
  xlab = "Components",
  ylab = "Proportion of Variations",
  main = "MDS analysis cumulative plot"
)
axis(1, 1:10)
axis(2)

# Check for inflation of statistic
obs <- sort(results(test.qt)$chi2.1df)
ept <- sort(qchisq(seq_along(obs) / (length(obs) + 1), df = 1))
plot(
  ept, obs,
  main = "Genomic control (slope is the inflation factor)",
  xlab = "Expected chisq, 1df",
  ylab = "Observed chisq, 1df"
)
abline(0, 1, col = "red")
abline(0, test.qt@lambda[1], lty = 2)

# Definition of GIF
# Conventional definition
obs <- sort(results(test.qt)$Pc1df)
ept <- seq_along(obs) / (length(obs) + 1)
plot(
  -log10(ept), -log10(obs),
  main = "GWAS QQ plot adj. via Genomic Control",
  xlab = "Expected -log10(pvalue)",
  ylab = "Observed -log10(pvalue)"
)
abline(0, 1, col = "red")
abline(h = 8, lty = 2)

# EIGENSTRAT
# Copy the kinship matrix and replace its diagonal with hom(g.dat)$Var.
adj.gkin <- gkin
diag(adj.gkin) <- hom(g.dat)$Var
# naxes = 3 is default value
# (The egscore() call must sit on its own line: when fused after the comment
# above it was commented out and `test.eg` was never created.)
test.eg <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = 2)
descriptives.scan(test.eg)
snp.eg <- row.names(results(test.eg))[results(test.eg)$P1df < alpha]

# Change #PCs
# Print the lambda estimate for 1 to 10 axes.
for (k in 1:10) {
  test.tmp <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = k)
  print(test.tmp@lambda$estimate)
}

obs <- sort(results(test.eg)$Pc1df)
ept <- seq_along(obs) / (length(obs) + 1)
qqplot(
  -log10(ept), -log10(obs),
  main = "GWAS QQ plot adj. w/ EIGENSTRAT",
  xlab = "Expected -log10(pvalue)",
  ylab = "Observed -log10(pvalue)"
)
abline(0, 1, col = "red")
abline(h = 8, lty = 2)

# Manhattan plot comparison
plot(test.qt, col = "black")
add.plot(test.eg, col = "gray", pch = 3)
legend(
  "topright",
  c("Original plot", "After correction w/ EIGENSTRAT"),
  pch = c(1, 3)
)

###
# Basic test, binary trait
###

# load files to GenABEL
convert.snp.tped(
  tped = "gwa_gabel.tped",
  tfam = "gwa_gabel.tfam",
  out = "gwa_gabel.raw",
  strand = "u"
)
b.dat <- load.gwaa.data(
  phen = "gwa_gabel.praw",
  gen = "gwa_gabel.raw",
  force = TRUE
)
slotNames(b.dat)
slotNames(b.dat@gtdata)
b.dat@gtdata@nids

# number of cases and controls
# (disease == 1 counted as cases, disease == 0 as controls; NA is ignored,
# matching length(which(...)).)
case.size <- sum(b.dat@phdata$disease == 1, na.rm = TRUE)
control.size <- sum(b.dat@phdata$disease == 0, na.rm = TRUE)
case.size
control.size
snpsb.total <- b.dat@gtdata@nsnps

# GLM test
testb.snp <- scan.glm("disease ~ CRSNP", family = binomial(), data = b.dat)
names(testb.snp)
alpha <- 5e-8
testb.snp$P1df[testb.snp$P1df < alpha]

# Score test
testb.qt <- qtscore(disease, data = b.dat, trait = "binomial")
slotNames(testb.qt)
descriptives.scan(testb.qt)

# P1df and Pc1df values below the significance threshold.
results(testb.qt)$P1df[results(testb.qt)$P1df < alpha]
results(testb.qt)$Pc1df[results(testb.qt)$Pc1df < alpha]

===GxG Interaction===

./plink --noweb --ped simcasecon.ped --map simcasecon.map --assoc

./plink --noweb --ped simcasecon.ped --map simcasecon.map --fast-epistasis

./plink --noweb --ped simcasecon.ped --map simcasecon.map --fast-epistasis --case-only

./plink --noweb --ped simcasecon.ped --map simcasecon.map --epistasis

./plink --noweb --ped simcasecon.ped --map simcasecon.map --recodeA --out recoded

./plink --noweb --ped simcasecon.ped --map simcasecon.map --make-bed --out cassiformat

# The following commands are in the R environment
je <- read.table("cassi.out", header = TRUE)

library(ORMDR)

recoded <- read.table("recoded.raw", header = TRUE)
head(recoded)

# Keep columns 7-106 of the recoded file and append PHENOTYPE - 1 as
# column 101, named "casestatus".
newdata <- recoded[7:106]
ormdrdata <- cbind(newdata, recoded$PHENOTYPE - 1)
names(ormdrdata)[101] <- "casestatus"
head(ormdrdata)

# mdr.c() runs with combi = 1, 2 and 3, each with cv.fold = 10.
mdr1 <- mdr.c(ormdrdata, colresp = 101, cs = 1, combi = 1, cv.fold = 10)
mdr1$min.comb

mdr2 <- mdr.c(ormdrdata, colresp = 101, cs = 1, combi = 2, cv.fold = 10)
mdr2$min.comb

mdr3 <- mdr.c(ormdrdata, colresp = 101, cs = 1, combi = 3, cv.fold = 10)
mdr3$min.comb

# Test error rates, then their means.
mdr1$test.erate
mdr2$test.erate
mdr3$test.erate

mdr1mean <- mean(mdr1$test.erate)
mdr2mean <- mean(mdr2$test.erate)
mdr3mean <- mean(mdr3$test.erate)
mdr1mean
mdr2mean
mdr3mean

mdr2$best.combi
mdr2$min.comb
mdr3$best.combi
mdr3$min.comb

# Logistic regression with full factorial interaction between two SNPs.
logreg12 <- glm(
  casestatus ~ factor(snp1_2) * factor(snp2_1),
  family = binomial,
  data = ormdrdata
)
summary(logreg12)
anova(logreg12)
pchisq(701.68, 4, lower.tail = FALSE)
pchisq(703.82, 8, lower.tail = FALSE)

# Same approach for a three-SNP model.
logreg345 <- glm(
  casestatus ~ factor(snp3_2) * factor(snp4_2) * factor(snp5_2),
  family = binomial,
  data = ormdrdata
)
summary(logreg345)
anova(logreg345)
pchisq(45.6, 8, lower.tail = FALSE)

q()

### The following commands are in the linux shell

./BEAM3 beam3data.txt -o beam3results

./BEAM3 beam3data.txt -o beam3results -T 10

===Plink - Part 1 - Data QC===

#### in R - open R by simply typing R
setwd("to_your_working_directory/")

# List the individuals whose STATUS is "PROBLEM" in the sex-check output.
sexcheck <- read.table("GWAS_sex_checking.sexcheck", header = TRUE)
names(sexcheck)
sex_problem <- sexcheck[which(sexcheck$STATUS == "PROBLEM"), ]
sex_problem

q()

#### in R
setwd("to_your_working_directory/")

# Pairs with PI_HAT above 0.4.
dups <- read.table("duplicates.genome", header = TRUE)
problem_pairs <- dups[which(dups$PI_HAT > 0.4), ]
problem_pairs

# Pairs with PI_HAT above 0.05, restricted to the ID and PI_HAT columns.
problem_pairs <- dups[which(dups$PI_HAT > 0.05), ]
myvars <- c("FID1", "IID1", "FID2", "IID2", "PI_HAT")
problem_pairs[myvars]

q()

plink --file GWAS_clean3 --het

###### in R
Dataset <- read.table(
  "plink.het",
  header = TRUE,
  sep = "",
  na.strings = "NA",
  dec = ".",
  strip.white = TRUE
)
mean(Dataset$F)
sd(Dataset$F)

# Histogram of the standardized F values, written to hist.jpeg.
jpeg("hist.jpeg", height = 1000, width = 1000)
hist(scale(Dataset$F), xlim = c(-4, 4))
dev.off()

q()

plink --file GWAS_clean3 --pheno pheno.txt --pheno-name Aff --hardy

##### in R
hardy <- read.table("plink.hwe", header = TRUE)
names(hardy)

# Rows whose P value is below 0.0000009.
hwe_prob <- hardy[which(hardy$P < 0.0000009), ]
hwe_prob

q()

plink --file GWAS_clean4 --genome --cluster --mds-plot 10

#### in R
mydata <- read.table("mds_components.txt", header = TRUE)

# Plotting symbol per Group value (1 -> 15, 2 -> 16, 3 -> 2).
mydata$pch[mydata$Group == 1] <- 15
mydata$pch[mydata$Group == 2] <- 16
mydata$pch[mydata$Group == 3] <- 2

# Scatter plot of the first two MDS components, written to mds.jpeg.
jpeg("mds.jpeg", height = 500, width = 500)
plot(mydata$C1, mydata$C2, pch = mydata$pch)
dev.off()

q()

# Draw a QQ plot of -log10 p-values against their expectation i / (n + 1).
#
# pvals: numeric vector of p-values (sort() drops NA entries).
# title: main title of the plot.
#
# Both axes run from 0 to the largest observed -log10(p); the red identity
# line is drawn first and the points are added on top.
broadqq <- function(pvals, title) {
  observed <- sort(pvals)
  lobs <- -log10(observed)
  expected <- seq_along(observed)
  lexp <- -log10(expected / (length(expected) + 1))

  axis_max <- max(lobs)
  # Extend the reference line past 7 when the axes do, so it always spans
  # the whole panel.
  ref_max <- max(7, axis_max)

  plot(
    c(0, ref_max), c(0, ref_max),
    col = "red", lwd = 3, type = "l",
    xlab = "Expected (-logP)", ylab = "Observed (-logP)",
    xlim = c(0, axis_max), ylim = c(0, axis_max),
    las = 1, xaxs = "i", yaxs = "i", bty = "l",
    main = title
  )
  points(lexp, lobs, pch = 23, cex = .4, bg = "black")
}

# Two stacked QQ plots (unadjusted vs. adjusted for PC1 and PC2) in one file.
jpeg("qqplot_compare.jpeg", height = 1000, width = 500)
par(mfrow = c(2, 1))

aff_unadj <- read.table("unadj.assoc.logistic", header = TRUE)
aff_unadj.add.p <- aff_unadj[aff_unadj$TEST == "ADD", ]$P
broadqq(aff_unadj.add.p, "Some Trait Unadjusted")

aff_C1C2 <- read.table("PC1-PC2.assoc.logistic", header = TRUE)
aff_C1C2.add.p <- aff_C1C2[aff_C1C2$TEST == "ADD", ]$P
broadqq(aff_C1C2.add.p, "Some Trait Adjusted for PC1 and PC2")

dev.off()

# Rows with P below 0.0000001 in each analysis.
gws_unadj <- aff_unadj[which(aff_unadj$P < 0.0000001), ]
gws_unadj

gws_adjusted <- aff_C1C2[which(aff_C1C2$P < 0.0000001), ]
gws_adjusted

===RV-TDT===

### Variant Annotation

vtools init rvtdt

vtools import --format vcf data/data.vcf --build hg19

vtools phenotype --from_file data/phen.txt

vtools execute ANNOVAR geneanno

vtools select variant "variant.region_type like '%splicing%'or variant.mut_type like 'nonsynonymous%' or variant.mut_type like 'frameshift%' or variant.mut_type like 'stop%'" -t func_variant

vtools export func_variant --format tped --samples 'phenotype is not null' > vat_raw.tped

# set marker name as chr_pos, needs to avoid duplicate name

sort -k4 -n vat_raw.tped | awk 'BEGIN{OFS="\t";prev="None";copy=1} {$2=$1"_"$4; $3=0; if($2==prev) {$2=$2"_"copy; copy=copy+1} else {prev=$2; copy=1}; print $0}' > vat_export.tped

vtools phenotype --out family sample_name pid mid sex phenotype > vat_export.tfam

vtools use refGene-hg19_20130904

vtools update func_variant --set 'maf=0.001' # set the maf to be 0.001

vtools select func_variant -o chr pos refGene.name2 maf --header > vat_export.anno

### Phasing Trio

plink --noweb --tfile vat_export --recode12 --me 1 1 --set-me-missing --out "recode12_noME"

# Build the linkage-format pedigree file: sort, convert spaces to tabs and
# drop columns 1, 3, 4 and 5.
sort -n -k1 -k6 -k2 recode12_noME.ped | sed 's/ /\t/g' | cut -f1,3,4,5 --complement > linkage.ped

# Build the matching .dat file: one "M <marker>" line per map entry, with
# the two header lines inserted in front.
cut -f2 recode12_noME.map | awk 'BEGIN{OFS="\t";} {print "M",$0}' | sed '1i\I\tid\nA\tDisease' > linkage.dat

java -Xmx10000m -jar java/linkage2beagle.jar linkage.dat linkage.ped > pre_beagle.bgl

python script/pre_phase.py -i pre_beagle.bgl -a pre_beagle_withMissing.bgl

java -Xmx10000m -jar java/beagle.jar missing=0 trios=pre_beagle.bgl out=bgl_phased verbose=false redundant=true

gunzip bgl_phased.pre_beagle.bgl.phased.gz

### RV-TDT Analysis

python script/post_phase.py -a vat_export.anno -b bgl_phased.pre_beagle.bgl.phased -o genes/

# Run rvTDT on the first 20 genes that have a .tped file under genes/.
# (The loop needs its `do` keyword and exactly one closing `done`.)
for g in `ls genes | grep tped | cut -d"." -f1 | head -20`
do
    echo "running rvTDT on gene "${g}
    rvTDT exercise_proj -G ./genes/${g}.tped -P ./data/rvtdt.phen -M ./genes/${g}.map --adapt 500 --alpha 0.00001 --permut 2000 --lower_cutoff 0 --upper_cutoff 100 --minVariants 3 --maxMissRatio 1
done

===Seqspark===

hdfs dfs -put demo.vcf.bz2

hdfs dfs -put demo.tsv

seqspark annotation.conf

seqspark qc.conf

seqspark demo.conf

===VAT===

head GenotypeSummary.txt

vtools output variant "max(DP)" "min(DP)" "avg(DP)" "stdev(DP)" "lower_quartile(DP)" "upper_quartile(DP)" --header

# Same depth summary restricted to variants with filter='PASS'.
vtools select variant "filter='PASS'" --count

vtools select variant "filter='PASS'" -o "max(DP)" "min(DP)" "avg(DP)" "stdev(DP)" "lower_quartile(DP)" "upper_quartile(DP)" --header

vtools update variant --from_stat 'total=#(GT)' 'num=#(alt)' 'het=#(het)' 'hom=#(hom)' 'other=#(other)' 'minDP=min(DP_geno)' 'maxDP=max(DP_geno)' 'meanDP=avg(DP_geno)' 'maf=maf()'

vtools show fields

vtools show table variant

# Same statistics restricted to genotypes with DP_geno > 10 (GD10 suffix).
vtools update variant --from_stat 'totalGD10=#(GT)' 'numGD10=#(alt)' 'hetGD10=#(het)' 'homGD10=#(hom)' 'otherGD10=#(other)' 'mafGD10=maf()' --genotypes "DP_geno > 10"

vtools show fields

vtools show table variant

vtools output variant chr pos maf mafGD10 --header --limit 20

# RACE is set to 0 for samples whose filename starts with YRI, 1 for CEU.
vtools phenotype --set "RACE=0" --samples "filename like 'YRI%'"

vtools phenotype --set "RACE=1" --samples "filename like 'CEU%'"

vtools show samples --limit 10

vtools update variant --from_stat 'CEU_mafGD10=maf()' --genotypes 'DP_geno>10' --samples "RACE=1"

vtools update variant --from_stat 'YRI_mafGD10=maf()' --genotypes 'DP_geno>10' --samples "RACE=0"

vtools output variant chr pos mafGD10 CEU_mafGD10 YRI_mafGD10 --header --limit 10

vtools phenotype --from_stat 'CEU_totalGD10=#(GT)' 'CEU_numGD10=#(alt)' --genotypes 'DP_geno>10' --samples "RACE=1"

vtools phenotype --from_stat 'YRI_totalGD10=#(GT)' 'YRI_numGD10=#(alt)' --genotypes 'DP_geno>10' --samples "RACE=0"

# Field names must be separate arguments (they were fused into one word).
vtools phenotype --output sample_name CEU_totalGD10 CEU_numGD10 YRI_totalGD10 YRI_numGD10 --header

vtools execute ANNOVAR geneanno

vtools show tables

vtools remove genotypes "DP_geno<10" -v0

vtools select variant "mut_type like 'non%' or mut_type like 'stop%' or region_type='splicing'" -t v_funct

vtools show tables

vtools show samples --limit 5

vtools select variant --samples "RACE=1" -t CEU

mkdir -p ceu

cd ceu

vtools init ceu --parent ../ --variants CEU --samples "RACE=1" --build hg19

vtools show project

# Variant tables split on CEU_mafGD10: >= 0.05 from all variants,
# < 0.01 from the v_funct table.
vtools select variant "CEU_mafGD10>=0.05" -t common_ceu

vtools select v_funct "CEU_mafGD10<0.01" -t rare_ceu

vtools use refGene

# NOTE(review): the command that writes EA_RV_VT.asso.res is not shown on
# this page -- confirm it is run before the two commands below.
less EA_RV_VT.asso.res

sort -g -k6 EA_RV_VT.asso.res | head

vtools select rare_ceu "refGene.name2='ABCC1'" -o chr pos ref alt CEU_mafGD10 numGD10 mut_type --header

cd ..

vtools select variant --samples "RACE=0" -t YRI

mkdir -p yri

cd yri

vtools init yri --parent ../ --variants YRI --samples "RACE=0" --build hg19

# Variant tables split on YRI_mafGD10: >= 0.05 from all variants,
# < 0.01 from the v_funct table.
vtools select variant "YRI_mafGD10>=0.05" -t common_yri

vtools select v_funct "YRI_mafGD10<0.01" -t rare_yri

vtools use refGene

vtools associate common_yri BMI --covariate SEX -m "LinRegBurden --alternative 2" -j1 --to_db YA_CV > YA_CV.asso.res

Serveradmin

Bureaucrat, administrator

1,252

edits