Difference between revisions of "AdvGeneMap2018Commands"
From Statistical Genetics Courses
Serveradmin (Talk | contribs) |
Serveradmin (Talk | contribs) (→GxG Interaction) |
||
| (28 intermediate revisions by the same user not shown) | |||
| Line 1: | Line 1: | ||
__NOTITLE__ | __NOTITLE__ | ||
| + | __FORCETOC__ | ||
| − | |||
| − | + | ||
| − | + | ===Functional Annotation=== | |
| − | + | table_annovar.pl | |
| + | table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Gene.vcf -remove -nastring . -protocol refGene -operation g -vcfinput | ||
| + | cat APOC3_Gene.vcf.hg19_multianno.txt | ||
| + | table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Gene.vcf -remove -nastring . -protocol refGene,knownGene,ensGene -operation g,g,g -arg '-splicing 12 -exonicsplicing','-splicing 12 -exonicsplicing','-splicing 12 -exonicsplicing' -vcfinput | ||
| + | awk -F'\t' '{print $1,$2,$6,$7,$8,$9,$10}' APOC3_Gene.vcf.hg19_multianno.txt | ||
| + | table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Region.vcf -remove -nastring . -protocol phastConsElements46way -operation r -vcfinput | ||
| + | table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Region.vcf -remove -nastring . -protocol gwasCatalog -operation r -vcfinput | ||
| + | table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Filter.vcf -remove -nastring . -protocol gnomad_genome,gnomad_exome,popfreq_max_20150413,gme,avsnp150,dbnsfp33a,dbscsnv11,cadd13gt20,clinvar_20170905,gwava -operation f,f,f,f,f,f,f,f,f,f -vcfinput | ||
| + | awk -F'\t' '{print $1,$2,$103,$104}' APOC3_Filter.vcf.hg19_multianno.txt | ||
| + | awk -F'\t' '{print $1,$2,$6,$14}' APOC3_Filter.vcf.hg19_multianno.txt | ||
| + | awk -F'\t' '{print $1,$2,$15,$16,$17,$18,$19,$20,$21,$22}' APOC3_Filter.vcf.hg19_multianno.txt | ||
| + | awk -F'\t' '{print $1,$2,$36,$86,$70}' APOC3_Filter.vcf.hg19_multianno.txt | ||
| + | awk -F'\t' '{print $1,$2,$99,$100}' APOC3_Filter.vcf.hg19_multianno.txt | ||
| + | table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_ANN.vcf -remove -nastring . -protocol refGene,knownGene,ensGene,wgRna,targetScanS,phastConsElements46way,tfbsConsSites,gwasCatalog,gnomad_genome,gnomad_exome,popfreq_max_20150413,gme,avsnp150,dbnsfp33a,dbscsnv11,cadd13gt20,clinvar_20170905,gwava -operation g,g,g,r,r,r,r,r,f,f,f,f,f,f,f,f,f,f -arg '-splicing 12 -exonicsplicing','-splicing 12 -exonicsplicing','-splicing 12 -exonicsplicing',,,,,,,,,,,,,,, -vcfinput | ||
| + | |||
| + | ===GenABEL=== | ||
| + | # Load files | ||
library(GenABEL) | library(GenABEL) | ||
convert.snp.tped(tped = "gwa_gabel_qtl.tped", tfam = "gwa_gabel_qtl.tfam", out = "gwa_gabel_qtl.raw", strand = "u") | convert.snp.tped(tped = "gwa_gabel_qtl.tped", tfam = "gwa_gabel_qtl.tfam", out = "gwa_gabel_qtl.raw", strand = "u") | ||
| Line 12: | Line 28: | ||
slotNames(g.dat@gtdata) | slotNames(g.dat@gtdata) | ||
colnames(g.dat@phdata) | colnames(g.dat@phdata) | ||
| + | # sample size | ||
sample.size <- g.dat@gtdata@nids | sample.size <- g.dat@gtdata@nids | ||
| + | # number of SNPs | ||
snps.total <- g.dat@gtdata@nsnps | snps.total <- g.dat@gtdata@nsnps | ||
| − | print(c(sample.size, snps.total)) | + | print(c(sample.size, snps.total)) |
| + | # Trait | ||
summary(g.dat@phdata$disease) | summary(g.dat@phdata$disease) | ||
| − | hist(g.dat@phdata$disease, main="Quantitative Phenotype data summary", xlab = "Systolic pressure", freq = F,breaks=20, col="gray") | + | hist(g.dat@phdata$disease, main="Quantitative Phenotype data summary", xlab = "Systolic pressure measure", freq = F,breaks=20, col="gray") |
| − | rug(g.dat@phdata$disease) | + | rug(g.dat@phdata$disease) |
| + | ### | ||
| + | # tests for association | ||
| + | ### | ||
| + | # GLM test | ||
test.snp <- scan.glm('disease ~ CRSNP', family = gaussian(), data = g.dat) | test.snp <- scan.glm('disease ~ CRSNP', family = gaussian(), data = g.dat) | ||
| − | names(test.snp) | + | names(test.snp) |
| − | alpha <- 5e-8 | + | alpha <- 5e-8 |
test.snp$snpnames[test.snp$P1df < alpha] | test.snp$snpnames[test.snp$P1df < alpha] | ||
test.snp$P1df[test.snp$P1df < alpha] | test.snp$P1df[test.snp$P1df < alpha] | ||
| + | # Score test | ||
test.qt <- qtscore(disease, data = g.dat, trait = "gaussian") | test.qt <- qtscore(disease, data = g.dat, trait = "gaussian") | ||
slotNames(test.qt) | slotNames(test.qt) | ||
names(test.qt@results) | names(test.qt@results) | ||
| − | |||
test.qt@lambda | test.qt@lambda | ||
descriptives.scan(test.qt) | descriptives.scan(test.qt) | ||
| − | + | rownames(results(test.qt))[results(test.qt)$P1df < alpha] | |
| − | results(test.qt)$P1df[results(test.qt)$P1df < alpha] results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha] | + | results(test.qt)$P1df[results(test.qt)$P1df < alpha] |
| + | results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha] | ||
| + | # QQ plot | ||
obs <- sort(results(test.qt)$P1df) | obs <- sort(results(test.qt)$P1df) | ||
| − | ept <- | + | ept <- c(1:length(obs)) / (length(obs) + 1) |
plot(-log10(ept), -log10(obs), main = "GWAS QQ plot, qtl", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)") | plot(-log10(ept), -log10(obs), main = "GWAS QQ plot, qtl", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)") | ||
abline(0, 1, col = "red") | abline(0, 1, col = "red") | ||
abline(h = 8, lty = 2) | abline(h = 8, lty = 2) | ||
| + | # Manhattan plot | ||
plot(test.qt, col = "black") | plot(test.qt, col = "black") | ||
| + | # Adding confounders | ||
test.qt.sex <- qtscore(disease ~ sex, data = g.dat, trait = "gaussian") | test.qt.sex <- qtscore(disease ~ sex, data = g.dat, trait = "gaussian") | ||
| − | + | rownames(results(test.qt.sex))[results(test.qt)$P1df < alpha] | |
summary(lm(disease ~ sex, data = g.dat)) | summary(lm(disease ~ sex, data = g.dat)) | ||
| − | + | ### | |
| − | + | # MDS | |
| − | + | ### | |
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
gkin <- ibs(g.dat, weight = "freq") | gkin <- ibs(g.dat, weight = "freq") | ||
gkin[1:10,1:10] | gkin[1:10,1:10] | ||
cps.full <- cmdscale(as.dist(.5 - gkin), eig = T, k = 10) | cps.full <- cmdscale(as.dist(.5 - gkin), eig = T, k = 10) | ||
| − | names(cps.full) | + | names(cps.full) |
| − | cps <- cps.full$points | + | cps <- cps.full$points |
plot(cps[,1], cps[,2], pch = g.dat@phdata$popn) | plot(cps[,1], cps[,2], pch = g.dat@phdata$popn) | ||
| − | legend( | + | legend(-0.16, 0.06, c("TSI","MEX", "CEU"), pch = c(1,2,3)) |
| + | ### | ||
| + | # Corrected test | ||
| + | ### | ||
| + | # Incorporating PCs as predictors | ||
colnames(cps)<-c('C1','C2','C3','C4','C5','C6','C7','C8','C9','C10') | colnames(cps)<-c('C1','C2','C3','C4','C5','C6','C7','C8','C9','C10') | ||
gpc.dat <- g.dat | gpc.dat <- g.dat | ||
gpc.dat@phdata<-cbind(g.dat@phdata, cps) | gpc.dat@phdata<-cbind(g.dat@phdata, cps) | ||
| − | test.pc.a <- scan.glm('disease ~ CRSNP + C1 + C2 + C3 + C4 + C5', family=gaussian(), data = gpc.dat) | + | test.pc.a <- scan.glm('disease ~ CRSNP + C1 + C2 + C3 + C4 + C5', family=gaussian(), data = gpc.dat) |
test.pc.a$snpnames[test.pc.a$P1df < alpha] | test.pc.a$snpnames[test.pc.a$P1df < alpha] | ||
test.pc.a$P1df[test.pc.a$P1df < alpha] | test.pc.a$P1df[test.pc.a$P1df < alpha] | ||
| − | test.pc.b <- qtscore(disease ~ C1 + C2 + C3 + C4 + C5, data = gpc.dat, trait = "gaussian") | + | test.pc.b <- qtscore(disease ~ C1 + C2 + C3 + C4 + C5, data = gpc.dat, trait = "gaussian") |
test.pc.b@lambda | test.pc.b@lambda | ||
| + | # scree plot | ||
plot(cps.full$eig[1:10]/sum(cps.full$eig), axes = F, type = "b", xlab = "Components", ylim = c(0,0.05), ylab = "Proportion of Variations", main = "MDS analysis scree plot") | plot(cps.full$eig[1:10]/sum(cps.full$eig), axes = F, type = "b", xlab = "Components", ylim = c(0,0.05), ylab = "Proportion of Variations", main = "MDS analysis scree plot") | ||
axis(1, 1:10) | axis(1, 1:10) | ||
axis(2) | axis(2) | ||
| + | # cumulative plot | ||
plot(cumsum(cps.full$eig[1:10])/sum(cps.full$eig), axes = F, type = "b", ylim = c(0,0.2), xlab = "Components", ylab = "Proportion of Variations", main = "MDS analysis cumulative plot") | plot(cumsum(cps.full$eig[1:10])/sum(cps.full$eig), axes = F, type = "b", ylim = c(0,0.2), xlab = "Components", ylab = "Proportion of Variations", main = "MDS analysis cumulative plot") | ||
axis(1, 1:10) | axis(1, 1:10) | ||
axis(2) | axis(2) | ||
| + | # Genomic control | ||
| + | # Uncorrected GIF | ||
| + | test.qt@lambda | ||
| + | # Corrected p-value | ||
row.names(results(test.qt))[results(test.qt)$Pc1df < alpha] | row.names(results(test.qt))[results(test.qt)$Pc1df < alpha] | ||
results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha] | results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha] | ||
| − | + | # Check for inflation of statistic | |
obs <- sort(results(test.qt)$chi2.1df) | obs <- sort(results(test.qt)$chi2.1df) | ||
| − | ept <- sort(qchisq( | + | ept <- sort(qchisq(1:length(obs) / (length(obs) + 1), df = 1)) |
| − | plot(ept, obs, main = "Genomic control ( | + | plot(ept, obs, main = "Genomic control (slope is the inflation factor)", xlab="Expected chisq, 1df", ylab="Observed chisq, 1df") |
abline(0, 1, col = "red") | abline(0, 1, col = "red") | ||
abline(0, test.qt@lambda[1], lty = 2) | abline(0, test.qt@lambda[1], lty = 2) | ||
| + | # Definition of GIF | ||
| + | # Conventional definition | ||
median(results(test.qt)$chi2.1df)/0.456 | median(results(test.qt)$chi2.1df)/0.456 | ||
| + | # GenABEL definition | ||
| + | lm(obs~ept)$coef[2] | ||
| + | # QQ plot | ||
obs <- sort(results(test.qt)$Pc1df) | obs <- sort(results(test.qt)$Pc1df) | ||
| − | ept <- | + | ept <- c(1:length(obs)) / (length(obs) + 1) |
plot(-log10(ept), -log10(obs), main = "GWAS QQ plot adj. via Genomic Control", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)") | plot(-log10(ept), -log10(obs), main = "GWAS QQ plot adj. via Genomic Control", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)") | ||
abline(0, 1, col = "red") | abline(0, 1, col = "red") | ||
| − | abline(h = 8, lty = 2) | + | abline(h = 8, lty = 2) |
| + | # EIGENSTRAT | ||
adj.gkin = gkin | adj.gkin = gkin | ||
diag(adj.gkin) = hom(g.dat)$Var | diag(adj.gkin) = hom(g.dat)$Var | ||
| + | # naxes = 3 is default value | ||
test.eg <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = 2) | test.eg <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = 2) | ||
descriptives.scan(test.eg) | descriptives.scan(test.eg) | ||
snp.eg <- row.names(results(test.eg))[results(test.eg)$P1df < alpha] | snp.eg <- row.names(results(test.eg))[results(test.eg)$P1df < alpha] | ||
| − | pvalue.eg <- results(test.eg)$P1df[results(test.eg)$P1df < alpha] lambda.eg <- test.eg@lambda snp.eg | + | pvalue.eg <- results(test.eg)$P1df[results(test.eg)$P1df < alpha] |
| + | lambda.eg <- test.eg@lambda | ||
| + | snp.eg | ||
pvalue.eg | pvalue.eg | ||
lambda.eg | lambda.eg | ||
| − | for (k in 1:10){ | + | # Change #PCs |
| − | + | for (k in 1:10){ | |
| + | test.tmp <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = k) | ||
print(test.tmp@lambda$estimate) | print(test.tmp@lambda$estimate) | ||
} | } | ||
| + | # QQ plot | ||
obs <- sort(results(test.eg)$Pc1df) | obs <- sort(results(test.eg)$Pc1df) | ||
| − | ept <- | + | ept <- c(1:length(obs)) / (length(obs) + 1) |
| − | + | qqplot(-log10(ept), -log10(obs), main = "GWAS QQ plot adj. w/ EIGENSTRAT", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)") | |
abline(0, 1, col = "red") | abline(0, 1, col = "red") | ||
abline(h = 8, lty = 2) | abline(h = 8, lty = 2) | ||
| + | # Manhattan plot comparison | ||
plot(test.qt, col = "black") | plot(test.qt, col = "black") | ||
add.plot(test.eg, col = "gray", pch = 3) | add.plot(test.eg, col = "gray", pch = 3) | ||
| − | legend("topright", c("Original plot","After correction w/ EIGENSTRAT"), pch = c(1,3))== | + | legend("topright", c("Original plot","After correction w/ EIGENSTRAT"), pch = c(1,3)) |
| + | ### | ||
| + | # Basic test, binary trait | ||
| + | ### | ||
| + | # load files to GenABEL | ||
| + | convert.snp.tped(tped = "gwa_gabel.tped", tfam = "gwa_gabel.tfam", out = "gwa_gabel.raw", strand = "u") | ||
| + | b.dat <- load.gwaa.data(phen = "gwa_gabel.praw", gen = "gwa_gabel.raw", force = T) | ||
| + | slotNames(b.dat) | ||
| + | slotNames(b.dat@gtdata) | ||
| + | colnames(b.dat@phdata) | ||
| + | # sample size | ||
| + | b.dat@gtdata@nids | ||
| + | # number of cases and controls | ||
| + | case.size <- length(which(b.dat@phdata$disease == 1)) | ||
| + | control.size <- length(which(b.dat@phdata$disease == 0)) | ||
| + | case.size | ||
| + | control.size | ||
| + | # number of SNPs | ||
| + | snpsb.total <- b.dat@gtdata@nsnps | ||
| + | # GLM test | ||
| + | testb.snp <- scan.glm('disease ~ CRSNP', family = binomial(), data = b.dat) | ||
| + | names(testb.snp) | ||
| + | alpha <- 5e-8 | ||
| + | testb.snp$snpnames[testb.snp$P1df < alpha] | ||
| + | testb.snp$P1df[testb.snp$P1df < alpha] | ||
| + | # Score test | ||
| + | testb.qt <- qtscore(disease, data = b.dat, trait = "binomial") | ||
| + | slotNames(testb.qt) | ||
| + | descriptives.scan(testb.qt) | ||
| + | row.names(results(testb.qt))[results(testb.qt)$P1df < alpha] | ||
| + | results(testb.qt)$P1df[results(testb.qt)$P1df < alpha] | ||
| + | results(testb.qt)$Pc1df[results(testb.qt)$Pc1df < alpha] | ||
| + | |||
| + | |||
| + | ===GxG Interaction=== | ||
| − | plink | + | ./plink --noweb --ped simcasecon.ped --map simcasecon.map --assoc |
| − | + | ./plink --noweb --ped simcasecon.ped --map simcasecon.map --fast-epistasis | |
| − | plink -- | + | ./plink --noweb --ped simcasecon.ped --map simcasecon.map --fast-epistasis --case-only |
| − | plink -- | + | ./plink --noweb --ped simcasecon.ped --map simcasecon.map --epistasis |
| − | + | ./plink --noweb --ped simcasecon.ped --map simcasecon.map --recodeA --out recoded | |
| − | plink -- | + | ./plink --noweb --ped simcasecon.ped --map simcasecon.map --make-bed --out cassiformat |
| − | plink -- | + | |
| − | plink -- | + | |
| − | + | ||
R | R | ||
| − | + | # The following commands are in the R environment | |
| + | je <-read.table("cassi.out", header=T) | ||
| + | je | ||
| + | library(ORMDR) | ||
| + | recoded<-read.table("recoded.raw", header=T) | ||
| + | head(recoded) | ||
| + | newdata<-recoded[7:106] | ||
| + | ormdrdata<-cbind(newdata,recoded$PHENOTYPE-1) | ||
| + | names(ormdrdata)[101]<-"casestatus" | ||
| + | head(ormdrdata) | ||
| + | mdr1<-mdr.c(ormdrdata, colresp=101, cs=1, combi=1, cv.fold = 10) | ||
| + | mdr1$min.comb | ||
| + | mdr2<-mdr.c(ormdrdata, colresp=101, cs=1, combi=2, cv.fold = 10) | ||
| + | mdr2$min.comb | ||
| + | mdr3<-mdr.c(ormdrdata, colresp=101, cs=1, combi=3, cv.fold = 10) | ||
| + | mdr3$min.comb | ||
| + | mdr1$test.erate | ||
| + | mdr2$test.erate | ||
| + | mdr3$test.erate | ||
| + | mdr1mean<-mean(mdr1$test.erate) | ||
| + | mdr2mean<-mean(mdr2$test.erate) | ||
| + | mdr3mean<-mean(mdr3$test.erate) | ||
| + | mdr1mean | ||
| + | mdr2mean | ||
| + | mdr3mean | ||
| + | mdr2$best.combi | ||
| + | mdr2$min.comb | ||
| + | mdr3$best.combi | ||
| + | mdr3$min.comb | ||
| + | logreg12<-glm(casestatus ~ factor(snp1_2)*factor(snp2_1), family=binomial, | ||
| + | data=ormdrdata) | ||
| + | summary(logreg12) | ||
| + | anova(logreg12) | ||
| + | pchisq(701.68,4,lower.tail=F) | ||
| + | pchisq(703.82,8,lower.tail=F) | ||
| + | logreg345<-glm(casestatus ~ factor(snp3_2)*factor(snp4_2)*factor(snp5_2), | ||
| + | family=binomial, data=ormdrdata) | ||
| + | summary(logreg345) | ||
| + | anova(logreg345) | ||
| + | pchisq(45.6,8,lower.tail=F) | ||
| + | q() | ||
| + | ### The following commands are in the linux shell | ||
| + | ./BEAM3 beam3data.txt -o beam3results | ||
| + | ./BEAM3 beam3data.txt -o beam3results -T 10 | ||
| + | |||
| + | ===Plink - Part 1 - Data QC=== | ||
| + | |||
| + | plink --file GWAS | ||
| + | plink --file GWAS --mind 0.10 --recode --out GWAS_clean_mind | ||
| + | plink --file GWAS_clean_mind --maf 0.05 --recode --out MAF_greater_5 | ||
| + | plink --file GWAS_clean_mind --exclude MAF_greater_5.map --recode --out MAF_less_5 | ||
| + | plink --file MAF_greater_5 --geno 0.05 --recode --out MAF_greater_5_clean | ||
| + | plink --file MAF_less_5 --geno 0.01 --recode --out MAF_less_5_clean | ||
| + | plink --file MAF_greater_5_clean --merge MAF_less_5_clean.ped MAF_less_5_clean.map --recode --out GWAS_MAF_clean | ||
| + | plink --file GWAS_MAF_clean --mind 0.03 --recode --out GWAS_clean2 | ||
| + | plink --file GWAS_clean2 --check-sex --out GWAS_sex_checking | ||
| + | #### in R - open R by simply typing R | ||
| + | setwd("to_your_working_directory/") | ||
| + | sexcheck = read.table("GWAS_sex_checking.sexcheck", header=T) | ||
names(sexcheck) | names(sexcheck) | ||
| − | sex_problem | + | sex_problem = sexcheck[which(sexcheck$STATUS=="PROBLEM"),] |
sex_problem | sex_problem | ||
q() | q() | ||
| − | plink --file GWAS_clean2 --genome --out duplicates | + | ################################## |
| − | R | + | plink --file GWAS_clean2 --genome --out duplicates |
| − | dups | + | #### in R |
| − | problem_pairs | + | setwd("to_your_working_directory/") |
| + | dups = read.table("duplicates.genome", header = T) | ||
| + | problem_pairs = dups[which(dups$PI_HAT > 0.4),] | ||
problem_pairs | problem_pairs | ||
| − | problem_pairs | + | problem_pairs = dups[which(dups$PI_HAT > 0.05),] |
| − | myvars | + | myvars = c("FID1", "IID1", "FID2", "IID2", "PI_HAT") |
problem_pairs[myvars] | problem_pairs[myvars] | ||
q() | q() | ||
| − | plink --file GWAS_clean2 --remove IBS_excluded.txt --recode --out GWAS_clean3 | + | ###### |
| − | plink --file GWAS_clean3 --het | + | plink --file GWAS_clean2 --remove IBS_excluded.txt --recode --out GWAS_clean3 |
| − | R | + | plink --file GWAS_clean3 --het |
| − | Dataset <- read.table("plink.het", header | + | ###### in R |
| − | + | Dataset <- read.table("plink.het", header=TRUE, sep="", na.strings="NA", dec=".", strip.white=TRUE) | |
mean(Dataset$F) | mean(Dataset$F) | ||
sd(Dataset$F) | sd(Dataset$F) | ||
| − | jpeg("hist.jpeg", height | + | jpeg("hist.jpeg", height=1000, width=1000) |
| − | hist(scale(Dataset$F), xlim | + | hist(scale(Dataset$F), xlim=c(-4,4)) |
dev.off() | dev.off() | ||
q() | q() | ||
| − | plink --file GWAS_clean3 --pheno pheno.txt --pheno-name Aff --hardy | + | ###### |
| − | R | + | plink --file GWAS_clean3 --pheno pheno.txt --pheno-name Aff --hardy |
| − | hardy | + | ##### in R |
| + | hardy = read.table("plink.hwe", header = T) | ||
names(hardy) | names(hardy) | ||
| − | hwe_prob | + | hwe_prob = hardy[which(hardy$P < 0.0000009),] |
hwe_prob | hwe_prob | ||
q() | q() | ||
| − | plink --file GWAS_clean3 --exclude HWE_out.txt --recode --out GWAS_clean4 | + | ########## |
| − | plink --file GWAS_clean4 --genome --mds-plot 10 | + | plink --file GWAS_clean3 --exclude HWE_out.txt --recode --out GWAS_clean4 |
| − | R | + | |
| − | mydata | + | |
| − | mydata$pch[mydata$Group | + | ===Plink - Part 2 - Controlling for Substructure=== |
| − | mydata$pch[mydata$Group | + | |
| − | mydata$pch[mydata$Group | + | plink --file GWAS_clean4 --genome --cluster --mds-plot 10 |
| − | jpeg("mds.jpeg", height | + | #### in R |
| − | plot(mydata$C1, mydata$C2 ,pch | + | mydata = read.table("mds_components.txt", header=T) |
| + | mydata$pch[mydata$Group==1 ] <-15 | ||
| + | mydata$pch[mydata$Group==2 ] <-16 | ||
| + | mydata$pch[mydata$Group==3 ] <-2 | ||
| + | jpeg("mds.jpeg", height=500, width=500) | ||
| + | plot(mydata$C1, mydata$C2 ,pch=mydata$pch) | ||
dev.off() | dev.off() | ||
q() | q() | ||
| − | plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --logistic --adjust --out unadj -- | + | ###### |
| − | plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --covar plink. | + | plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --logistic --adjust --out unadj |
| − | plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --covar plink. | + | plink --file GWAS_clean4 --genome --cluster --pca 10 header |
| − | R | + | plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --covar plink.eigenvec --covar-name PC1 --logistic --adjust --out PC1 |
| + | plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --covar plink.eigenvec --covar-name PC1-PC2 --logistic --adjust --out PC1-PC2 | ||
| + | #### in R | ||
broadqq <-function(pvals, title) | broadqq <-function(pvals, title) | ||
{ | { | ||
| − | + | observed <- sort(pvals) | |
| − | + | lobs <- -(log10(observed)) | |
| − | + | expected <- c(1:length(observed)) | |
| − | + | lexp <- -(log10(expected / (length(expected)+1))) | |
| − | + | plot(c(0,7), c(0,7), col="red", lwd=3, type="l", xlab="Expected (-logP)", ylab="Observed (-logP)", xlim=c(0,max(lobs)), ylim=c(0,max(lobs)), las=1, xaxs="i", yaxs="i", bty="l", main = title) | |
| − | + | points(lexp, lobs, pch=23, cex=.4, bg="black") } | |
| − | jpeg("qqplot_compare.jpeg", height | + | jpeg("qqplot_compare.jpeg", height=1000, width=500) |
| − | par(mfrow | + | par(mfrow=c(2,1)) |
| − | aff_unadj<-read.table("unadj.assoc.logistic", header | + | aff_unadj<-read.table("unadj.assoc.logistic", header=TRUE) |
| − | aff_unadj.add.p<-aff_unadj[aff_unadj$TEST | + | aff_unadj.add.p<-aff_unadj[aff_unadj$TEST==c("ADD"),]$P |
broadqq(aff_unadj.add.p,"Some Trait Unadjusted") | broadqq(aff_unadj.add.p,"Some Trait Unadjusted") | ||
| − | aff_C1C2<-read.table(" | + | aff_C1C2<-read.table("PC1-PC2.assoc.logistic", header=TRUE) |
| − | aff_C1C2.add.p<-aff_C1C2[aff_C1C2$TEST | + | aff_C1C2.add.p<-aff_C1C2[aff_C1C2$TEST==c("ADD"),]$P |
| − | broadqq(aff_C1C2.add.p, "Some Trait Adjusted") | + | broadqq(aff_C1C2.add.p, "Some Trait Adjusted for PC1 and PC2") |
dev.off() | dev.off() | ||
| − | gws_unadj | + | gws_unadj = aff_unadj[which(aff_unadj$P < 0.0000001),] |
gws_unadj | gws_unadj | ||
| − | gws_adjusted | + | gws_adjusted = aff_C1C2[which(aff_C1C2$P < 0.0000001),] |
gws_adjusted | gws_adjusted | ||
| − | + | ||
| − | ==VAT== | + | |
| + | |||
| + | |||
| + | ===RV-TDT=== | ||
| + | ### Variant Annotation | ||
| + | vtools init rvtdt | ||
| + | vtools import --format vcf data/data.vcf --build hg19 | ||
| + | vtools phenotype --from_file data/phen.txt | ||
| + | vtools execute ANNOVAR geneanno | ||
| + | vtools select variant "variant.region_type like '%splicing%'or variant.mut_type like 'nonsynonymous%' or variant.mut_type like 'frameshift%' or variant.mut_type like 'stop%'" -t func_variant | ||
| + | vtools export func_variant --format tped --samples 'phenotype is not null' > vat_raw.tped | ||
| + | # set marker name as chr_pos, needs to avoid duplicate name | ||
| + | sort -k4 -n vat_raw.tped | awk 'BEGIN{OFS="\t";prev="None";copy=1} {$2=$1"_"$4; $3=0; if($2==prev) {$2=$2"_"copy; copy=copy+1} else {prev=$2; copy=1}; print $0}' > vat_export.tped | ||
| + | vtools phenotype --out family sample_name pid mid sex phenotype > vat_export.tfam | ||
| + | vtools use refGene-hg19_20130904 | ||
| + | vtools update func_variant --set 'maf=0.001' # set the maf to be 0.001 | ||
| + | vtools select func_variant -o chr pos refGene.name2 maf --header > vat_export.anno | ||
| + | |||
| + | ### Phasing Trio | ||
| + | plink --noweb --tfile vat_export --recode12 --me 1 1 --set-me-missing --out "recode12_noME" | ||
| + | sort -n -k1 -k6 -k2 recode12_noME.ped | sed 's/ /\t/g' | cut -f1,3,4,5 --complement > linkage.ped cut -f2 recode12_noME.map | awk 'BEGIN{OFS="\t";} {print "M",$0}' | sed '1i\I\tid\nA\tDisease' > linkage.dat | ||
| + | java -Xmx10000m -jar java/linkage2beagle.jar linkage.dat linkage.ped > pre_beagle.bgl | ||
| + | python script/pre_phase.py -i pre_beagle.bgl -a pre_beagle_withMissing.bgl | ||
| + | java -Xmx10000m -jar java/beagle.jar missing=0 trios=pre_beagle.bgl out=bgl_phased verbose=false redundant=true | ||
| + | gunzip bgl_phased.pre_beagle.bgl.phased.gz | ||
| + | |||
| + | ### RV-TDT Analysis | ||
| + | python script/post_phase.py -a vat_export.anno -b bgl_phased.pre_beagle.bgl.phased -o genes/ | ||
| + | for g in `ls genes | grep tped | cut -d"." -f1 | head -20` | ||
| + | do | ||
| + | echo "running rvTDT on gene "${g} | ||
| + | rvTDT exercise_proj -G ./genes/${g}.tped -P ./data/rvtdt.phen -M ./genes/${g}.map --adapt 500 --alpha 0.00001 --permut 2000 --lower_cutoff 0 --upper_cutoff 100 --minVariants 3 --maxMissRatio 1 done | ||
| + | done | ||
| + | |||
| + | |||
| + | |||
| + | ===Seqspark=== | ||
| + | hdfs dfs -put demo.vcf.bz2 | ||
| + | hdfs dfs -put demo.tsv | ||
| + | seqspark annotation.conf | ||
| + | seqspark qc.conf | ||
| + | seqspark demo.conf | ||
| + | |||
| + | ===VAT=== | ||
| + | |||
vtools -h | vtools -h | ||
vtools init VATDemo | vtools init VATDemo | ||
| Line 214: | Line 387: | ||
head GenotypeSummary.txt | head GenotypeSummary.txt | ||
vtools output variant "max(DP)" "min(DP)" "avg(DP)" "stdev(DP)" "lower_quartile(DP)" "upper_quartile(DP)" --header | vtools output variant "max(DP)" "min(DP)" "avg(DP)" "stdev(DP)" "lower_quartile(DP)" "upper_quartile(DP)" --header | ||
| − | vtools select variant "filter= | + | vtools select variant "filter=’PASS’" --count |
| − | vtools select variant "filter= | + | vtools select variant "filter=’PASS’" -o "max(DP)" "min(DP)" "avg(DP)" "stdev(DP)" "lower_quartile(DP)" "upper_quartile(DP)" --header |
| − | vtools update variant --from_stat | + | vtools update variant --from_stat ’total=#(GT)’ ’num=#(alt)’ ’het=#(het)’ ’hom=#(hom)’ ’other=#(other)’ ’minDP=min(DP_geno)’ ’maxDP=max(DP_geno)’ ’meanDP=avg(DP_geno)’ ’maf=maf()’ |
vtools show fields | vtools show fields | ||
vtools show table variant | vtools show table variant | ||
| − | vtools update variant --from_stat | + | vtools update variant --from_stat ’totalGD10=#(GT)’ ’numGD10=#(alt)’ ’hetGD10=#(het)’ ’homGD10=#(hom)’ ’otherGD10=#(other)’ ’mafGD10=maf()’ --genotypes "DP_geno > 10" |
vtools show fields | vtools show fields | ||
vtools show table variant | vtools show table variant | ||
vtools output variant chr pos maf mafGD10 --header --limit 20 | vtools output variant chr pos maf mafGD10 --header --limit 20 | ||
| − | vtools phenotype --set "RACE=0" --samples "filename like | + | vtools phenotype --set "RACE=0" --samples "filename like ’YRI%’" |
| − | vtools phenotype --set "RACE=1" --samples "filename like | + | vtools phenotype --set "RACE=1" --samples "filename like ’CEU%’" |
vtools show samples --limit 10 | vtools show samples --limit 10 | ||
| − | vtools update variant --from_stat | + | vtools update variant --from_stat ’CEU_mafGD10=maf()’ --genotypes ’DP_geno>10’ --samples "RACE=1" |
| − | vtools update variant --from_stat | + | vtools update variant --from_stat ’YRI_mafGD10=maf()’ --genotypes ’DP_geno>10’ --samples "RACE=0" |
vtools output variant chr pos mafGD10 CEU_mafGD10 YRI_mafGD10 --header --limit 10 | vtools output variant chr pos mafGD10 CEU_mafGD10 YRI_mafGD10 --header --limit 10 | ||
| − | vtools phenotype --from_stat | + | vtools phenotype --from_stat ’CEU_totalGD10=#(GT)’ ’CEU_numGD10=#(alt)’ --genotypes ’DP_geno>10’ --samples "RACE=1" |
| − | vtools phenotype --from_stat | + | vtools phenotype --from_stat ’YRI_totalGD10=#(GT)’ ’YRI_numGD10=#(alt)’ --genotypes ’DP_geno>10’ --samples "RACE=0" |
| − | vtools phenotype --output | + | vtools phenotype --output sample_nameCEU_totalGD10CEU_numGD10YRI_totalGD10YRI_numGD10 --header |
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
| − | + | ||
vtools execute ANNOVAR geneanno | vtools execute ANNOVAR geneanno | ||
vtools output variant chr pos ref alt mut_type --limit 20 --header | vtools output variant chr pos ref alt mut_type --limit 20 --header | ||
| Line 245: | Line 413: | ||
vtools remove variants to_remove -v0 | vtools remove variants to_remove -v0 | ||
vtools show tables | vtools show tables | ||
| − | vtools remove genotypes "DP_geno<10" -v0 | + | vtools remove genotypes "DP_geno<10" -v0 |
| − | + | vtools select variant "mut_type like ’non%’ or mut_type like ’stop%’ or region_type=’splicing’" -t v_funct | |
| + | vtools show tables | ||
| + | vtools show samples --limit 5 | ||
| + | vtools select variant --samples "RACE=1" -t CEU | ||
| + | mkdir -p ceu | ||
| + | cd ceu | ||
| + | vtools init ceu --parent ../ --variants CEU --samples "RACE=1" --build hg19 vtools show project | ||
vtools select variant "CEU_mafGD10>=0.05" -t common_ceu | vtools select variant "CEU_mafGD10>=0.05" -t common_ceu | ||
| − | vtools select v_funct "CEU_mafGD10<0.01" -t rare_ceu | + | vtools select v_funct "CEU_mafGD10<0.01" -t rare_ceu |
| + | vtools use refGene | ||
| + | vtools show annotation refGene | ||
| + | vtools associate -h | ||
| + | vtools show tests | ||
| + | vtools show test LinRegBurden | ||
| + | vtools associate common_ceu BMI --covariate SEX -m "LinRegBurden --alternative 2" -j1 --to_db EA_CV > EA_CV.asso.res | ||
grep -i error *.log | grep -i error *.log | ||
less EA_CV.asso.res | less EA_CV.asso.res | ||
sort -g -k7 EA_CV.asso.res | head | sort -g -k7 EA_CV.asso.res | head | ||
vtools show fields | vtools show fields | ||
| − | vtools associate rare_ceu BMI --covariate SEX -m "LinRegBurden --alternative 2" -g refGene.name2 -j1 --to_db EA_RV > EA_RV.asso.res | + | vtools associate rare_ceu BMI --covariate SEX -m "LinRegBurden --alternative 2" -g refGene.name2 -j1 --to_db EA_RV > EA_RV.asso.res |
| − | grep -i error *.log | tail - | + | grep -i error *.log | tail -10 |
less EA_RV.asso.res | less EA_RV.asso.res | ||
sort -g -k6 EA_RV.asso.res | head | sort -g -k6 EA_RV.asso.res | head | ||
vtools associate rare_ceu BMI --covariate SEX -m "VariableThresholdsQt --alternative 2 -p 100000 --adaptive 0.0005" -g refGene.name2 -j1 --to_db EA_RV > EA_RV_VT.asso.res | vtools associate rare_ceu BMI --covariate SEX -m "VariableThresholdsQt --alternative 2 -p 100000 --adaptive 0.0005" -g refGene.name2 -j1 --to_db EA_RV > EA_RV_VT.asso.res | ||
| − | grep -i error *.log | tail - | + | grep -i error *.log | tail -10 |
less EA_RV_VT.asso.res | less EA_RV_VT.asso.res | ||
sort -g -k6 EA_RV_VT.asso.res | head | sort -g -k6 EA_RV_VT.asso.res | head | ||
| − | vtools select rare_ceu "refGene.name2= | + | vtools select rare_ceu "refGene.name2=’ABCC1’" -o chr pos ref alt CEU_mafGD10 numGD10 mut_type --header |
| − | + | cd .. | |
| − | + | vtools select variant --samples "RACE=0" -t YRI | |
| − | + | mkdir -p yri | |
| − | + | cd yri | |
| − | vtools associate rare_yri BMI --covariate SEX -m "LinRegBurden --alternative 2" -g refGene.name2 -j1 --to_db YA_RV > YA_RV.asso.res | + | vtools init yri --parent ../ --variants YRI --samples "RACE=0" --build hg19 vtools select variant "YRI_mafGD10>=0.05" -t common_yri vtools select v_funct "YRI_mafGD10<0.01" -t rare_yri |
| − | + | vtools use refGene | |
| + | vtools associate common_yri BMI --covariate SEX -m "LinRegBurden --alternative 2" -j1 --to_db YA_CV > YA_CV.asso.res | ||
| + | vtools associate rare_yri BMI --covariate SEX -m "LinRegBurden --alternative 2" -g refGene.name2 -j1 --to_db YA_RV > YA_RV.asso.res vtools associate rare_yri BMI --covariate SEX -m "VariableThresholdsQt --alternative 2 -p 100000 --adaptive 0.0005" -g refGene.name2 -j1 --to_db YA_RV > YA_RV_VT.asso.res | ||
cd .. | cd .. | ||
| − | vtools_report meta_analysis ceu/EA_RV_VT.asso.res yri/YA_RV_VT.asso.res --beta 5 --pval 6 --se 7 -n 2 --link 1 > | + | vtools_report meta_analysis ceu/EA_RV_VT.asso.res yri/YA_RV_VT.asso.res --beta 5 --pval 6 --se 7 -n 2 --link 1 > ME\ TA_RV_VT.asso.res |
cut -f1,3 META_RV_VT.asso.res | head | cut -f1,3 META_RV_VT.asso.res | head | ||
Latest revision as of 19:46, 23 January 2018
Contents
Functional Annotation
table_annovar.pl
table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Gene.vcf -remove -nastring . -protocol refGene -operation g -vcfinput
cat APOC3_Gene.vcf.hg19_multianno.txt
table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Gene.vcf -remove -nastring . -protocol refGene,knownGene,ensGene -operation g,g,g -arg '-splicing 12 -exonicsplicing','-splicing 12 -exonicsplicing','-splicing 12 -exonicsplicing' -vcfinput
awk -F'\t' '{print $1,$2,$6,$7,$8,$9,$10}' APOC3_Gene.vcf.hg19_multianno.txt
table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Region.vcf -remove -nastring . -protocol phastConsElements46way -operation r -vcfinput
table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Region.vcf -remove -nastring . -protocol gwasCatalog -operation r -vcfinput
table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_Filter.vcf -remove -nastring . -protocol gnomad_genome,gnomad_exome,popfreq_max_20150413,gme,avsnp150,dbnsfp33a,dbscsnv11,cadd13gt20,clinvar_20170905,gwava -operation f,f,f,f,f,f,f,f,f,f -vcfinput
awk -F'\t' '{print $1,$2,$103,$104}' APOC3_Filter.vcf.hg19_multianno.txt
awk -F'\t' '{print $1,$2,$6,$14}' APOC3_Filter.vcf.hg19_multianno.txt
awk -F'\t' '{print $1,$2,$15,$16,$17,$18,$19,$20,$21,$22}' APOC3_Filter.vcf.hg19_multianno.txt
awk -F'\t' '{print $1,$2,$36,$86,$70}' APOC3_Filter.vcf.hg19_multianno.txt
awk -F'\t' '{print $1,$2,$99,$100}' APOC3_Filter.vcf.hg19_multianno.txt
table_annovar.pl APOC3.vcf humandb/ -buildver hg19 -out APOC3_ANN.vcf -remove -nastring . -protocol refGene,knownGene,ensGene,wgRna,targetScanS,phastConsElements46way,tfbsConsSites,gwasCatalog,gnomad_genome,gnomad_exome,popfreq_max_20150413,gme,avsnp150,dbnsfp33a,dbscsnv11,cadd13gt20,clinvar_20170905,gwava -operation g,g,g,r,r,r,r,r,f,f,f,f,f,f,f,f,f,f -arg '-splicing 12 -exonicsplicing','-splicing 12 -exonicsplicing','-splicing 12 -exonicsplicing',,,,,,,,,,,,,,, -vcfinput
GenABEL
# Load files
library(GenABEL)
convert.snp.tped(tped = "gwa_gabel_qtl.tped", tfam = "gwa_gabel_qtl.tfam", out = "gwa_gabel_qtl.raw", strand = "u")
g.dat <- load.gwaa.data(phen = "gwa_gabel_qtl.praw", gen = "gwa_gabel_qtl.raw", force = T)
slotNames(g.dat)
slotNames(g.dat@gtdata)
colnames(g.dat@phdata)
# sample size
sample.size <- g.dat@gtdata@nids
# number of SNPs
snps.total <- g.dat@gtdata@nsnps
print(c(sample.size, snps.total))
# Trait
summary(g.dat@phdata$disease)
hist(g.dat@phdata$disease, main="Quantitative Phenotype data summary", xlab = "Systolic pressure measure", freq = F,breaks=20, col="gray")
rug(g.dat@phdata$disease)
###
# tests for association
###
# GLM test
test.snp <- scan.glm('disease ~ CRSNP', family = gaussian(), data = g.dat)
names(test.snp)
alpha <- 5e-8
test.snp$snpnames[test.snp$P1df < alpha]
test.snp$P1df[test.snp$P1df < alpha]
# Score test
test.qt <- qtscore(disease, data = g.dat, trait = "gaussian")
slotNames(test.qt)
names(test.qt@results)
test.qt@lambda
descriptives.scan(test.qt)
rownames(results(test.qt))[results(test.qt)$P1df < alpha]
results(test.qt)$P1df[results(test.qt)$P1df < alpha]
results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha]
# QQ plot
obs <- sort(results(test.qt)$P1df)
ept <- c(1:length(obs)) / (length(obs) + 1)
plot(-log10(ept), -log10(obs), main = "GWAS QQ plot, qtl", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)")
abline(0, 1, col = "red")
abline(h = 8, lty = 2)
# Manhattan plot
plot(test.qt, col = "black")
# Adding confounders
test.qt.sex <- qtscore(disease ~ sex, data = g.dat, trait = "gaussian")
rownames(results(test.qt.sex))[results(test.qt)$P1df < alpha]
summary(lm(disease ~ sex, data = g.dat))
###
# MDS
###
gkin <- ibs(g.dat, weight = "freq")
gkin[1:10,1:10]
cps.full <- cmdscale(as.dist(.5 - gkin), eig = T, k = 10)
names(cps.full)
cps <- cps.full$points
plot(cps[,1], cps[,2], pch = g.dat@phdata$popn)
legend(-0.16, 0.06, c("TSI","MEX", "CEU"), pch = c(1,2,3))
###
# Corrected test
###
# Incorporating PCs as predictors
colnames(cps)<-c('C1','C2','C3','C4','C5','C6','C7','C8','C9','C10')
gpc.dat <- g.dat
gpc.dat@phdata<-cbind(g.dat@phdata, cps)
test.pc.a <- scan.glm('disease ~ CRSNP + C1 + C2 + C3 + C4 + C5', family=gaussian(), data = gpc.dat)
test.pc.a$snpnames[test.pc.a$P1df < alpha]
test.pc.a$P1df[test.pc.a$P1df < alpha]
test.pc.b <- qtscore(disease ~ C1 + C2 + C3 + C4 + C5, data = gpc.dat, trait = "gaussian")
test.pc.b@lambda
# scree plot
plot(cps.full$eig[1:10]/sum(cps.full$eig), axes = F, type = "b", xlab = "Components", ylim = c(0,0.05), ylab = "Proportion of Variations", main = "MDS analysis scree plot")
axis(1, 1:10)
axis(2)
# cumulative plot
plot(cumsum(cps.full$eig[1:10])/sum(cps.full$eig), axes = F, type = "b", ylim = c(0,0.2), xlab = "Components", ylab = "Proportion of Variations", main = "MDS analysis cumulative plot")
axis(1, 1:10)
axis(2)
# Genomic control
# Uncorrected GIF
test.qt@lambda
# Corrected p-value
row.names(results(test.qt))[results(test.qt)$Pc1df < alpha]
results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha]
# Check for inflation of statistic
obs <- sort(results(test.qt)$chi2.1df)
ept <- sort(qchisq(1:length(obs) / (length(obs) + 1), df = 1))
plot(ept, obs, main = "Genomic control (slope is the inflation factor)", xlab="Expected chisq, 1df", ylab="Observed chisq, 1df")
abline(0, 1, col = "red")
abline(0, test.qt@lambda[1], lty = 2)
# Definition of GIF
# Conventional definition
median(results(test.qt)$chi2.1df)/0.456
# GenABEL definition
lm(obs~ept)$coef[2]
# QQ plot
obs <- sort(results(test.qt)$Pc1df)
ept <- c(1:length(obs)) / (length(obs) + 1)
plot(-log10(ept), -log10(obs), main = "GWAS QQ plot adj. via Genomic Control", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)")
abline(0, 1, col = "red")
abline(h = 8, lty = 2)
# EIGENSTRAT
adj.gkin = gkin
diag(adj.gkin) = hom(g.dat)$Var
# naxes = 3 is default value
test.eg <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = 2)
descriptives.scan(test.eg)
snp.eg <- row.names(results(test.eg))[results(test.eg)$P1df < alpha]
pvalue.eg <- results(test.eg)$P1df[results(test.eg)$P1df < alpha]
lambda.eg <- test.eg@lambda
snp.eg
pvalue.eg
lambda.eg
# Change #PCs
for (k in 1:10){
test.tmp <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = k)
print(test.tmp@lambda$estimate)
}
# QQ plot
obs <- sort(results(test.eg)$Pc1df)
ept <- c(1:length(obs)) / (length(obs) + 1)
qqplot(-log10(ept), -log10(obs), main = "GWAS QQ plot adj. w/ EIGENSTRAT", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)")
abline(0, 1, col = "red")
abline(h = 8, lty = 2)
# Manhattan plot comparison
plot(test.qt, col = "black")
add.plot(test.eg, col = "gray", pch = 3)
legend("topright", c("Original plot","After correction w/ EIGENSTRAT"), pch = c(1,3))
###
# Basic test, binary trait
###
# load files to GenABEL
convert.snp.tped(tped = "gwa_gabel.tped", tfam = "gwa_gabel.tfam", out = "gwa_gabel.raw", strand = "u")
b.dat <- load.gwaa.data(phen = "gwa_gabel.praw", gen = "gwa_gabel.raw", force = T)
slotNames(b.dat)
slotNames(b.dat@gtdata)
colnames(b.dat@phdata)
# sample size
b.dat@gtdata@nids
# number of cases and controls
case.size <- length(which(b.dat@phdata$disease == 1))
control.size <- length(which(b.dat@phdata$disease == 0))
case.size
control.size
# number of SNPs
snpsb.total <- b.dat@gtdata@nsnps
# GLM test
testb.snp <- scan.glm('disease ~ CRSNP', family = binomial(), data = b.dat)
names(testb.snp)
alpha <- 5e-8
testb.snp$snpnames[testb.snp$P1df < alpha]
testb.snp$P1df[testb.snp$P1df < alpha]
# Score test
testb.qt <- qtscore(disease, data = b.dat, trait = "binomial")
slotNames(testb.qt)
descriptives.scan(testb.qt)
row.names(results(testb.qt))[results(testb.qt)$P1df < alpha]
results(testb.qt)$P1df[results(testb.qt)$P1df < alpha]
results(testb.qt)$Pc1df[results(testb.qt)$Pc1df < alpha]
GxG Interaction
./plink --noweb --ped simcasecon.ped --map simcasecon.map --assoc
./plink --noweb --ped simcasecon.ped --map simcasecon.map --fast-epistasis
./plink --noweb --ped simcasecon.ped --map simcasecon.map --fast-epistasis --case-only
./plink --noweb --ped simcasecon.ped --map simcasecon.map --epistasis
./plink --noweb --ped simcasecon.ped --map simcasecon.map --recodeA --out recoded
./plink --noweb --ped simcasecon.ped --map simcasecon.map --make-bed --out cassiformat
R
# The following commands are in the R environment
je <-read.table("cassi.out", header=T)
je
library(ORMDR)
recoded<-read.table("recoded.raw", header=T)
head(recoded)
newdata<-recoded[7:106]
ormdrdata<-cbind(newdata,recoded$PHENOTYPE-1)
names(ormdrdata)[101]<-"casestatus"
head(ormdrdata)
mdr1<-mdr.c(ormdrdata, colresp=101, cs=1, combi=1, cv.fold = 10)
mdr1$min.comb
mdr2<-mdr.c(ormdrdata, colresp=101, cs=1, combi=2, cv.fold = 10)
mdr2$min.comb
mdr3<-mdr.c(ormdrdata, colresp=101, cs=1, combi=3, cv.fold = 10)
mdr3$min.comb
mdr1$test.erate
mdr2$test.erate
mdr3$test.erate
mdr1mean<-mean(mdr1$test.erate)
mdr2mean<-mean(mdr2$test.erate)
mdr3mean<-mean(mdr3$test.erate)
mdr1mean
mdr2mean
mdr3mean
mdr2$best.combi
mdr2$min.comb
mdr3$best.combi
mdr3$min.comb
logreg12<-glm(casestatus ~ factor(snp1_2)*factor(snp2_1), family=binomial,
data=ormdrdata)
summary(logreg12)
anova(logreg12)
pchisq(701.68,4,lower.tail=F)
pchisq(703.82,8,lower.tail=F)
logreg345<-glm(casestatus ~ factor(snp3_2)*factor(snp4_2)*factor(snp5_2),
family=binomial, data=ormdrdata)
summary(logreg345)
anova(logreg345)
pchisq(45.6,8,lower.tail=F)
q()
### The following commands are in the linux shell
./BEAM3 beam3data.txt -o beam3results
./BEAM3 beam3data.txt -o beam3results -T 10
Plink - Part 1 - Data QC
plink --file GWAS
plink --file GWAS --mind 0.10 --recode --out GWAS_clean_mind
plink --file GWAS_clean_mind --maf 0.05 --recode --out MAF_greater_5
plink --file GWAS_clean_mind --exclude MAF_greater_5.map --recode --out MAF_less_5
plink --file MAF_greater_5 --geno 0.05 --recode --out MAF_greater_5_clean
plink --file MAF_less_5 --geno 0.01 --recode --out MAF_less_5_clean
plink --file MAF_greater_5_clean --merge MAF_less_5_clean.ped MAF_less_5_clean.map --recode --out GWAS_MAF_clean
plink --file GWAS_MAF_clean --mind 0.03 --recode --out GWAS_clean2
plink --file GWAS_clean2 --check-sex --out GWAS_sex_checking
#### in R - open R by simply typing R
setwd("to_your_working_directory/")
sexcheck = read.table("GWAS_sex_checking.sexcheck", header=T)
names(sexcheck)
sex_problem = sexcheck[which(sexcheck$STATUS=="PROBLEM"),]
sex_problem
q()
##################################
plink --file GWAS_clean2 --genome --out duplicates
#### in R
setwd("to_your_working_directory/")
dups = read.table("duplicates.genome", header = T)
problem_pairs = dups[which(dups$PI_HAT > 0.4),]
problem_pairs
problem_pairs = dups[which(dups$PI_HAT > 0.05),]
myvars = c("FID1", "IID1", "FID2", "IID2", "PI_HAT")
problem_pairs[myvars]
q()
######
plink --file GWAS_clean2 --remove IBS_excluded.txt --recode --out GWAS_clean3
plink --file GWAS_clean3 --het
###### in R
Dataset <- read.table("plink.het", header=TRUE, sep="", na.strings="NA", dec=".", strip.white=TRUE)
mean(Dataset$F)
sd(Dataset$F)
jpeg("hist.jpeg", height=1000, width=1000)
hist(scale(Dataset$F), xlim=c(-4,4))
dev.off()
q()
######
plink --file GWAS_clean3 --pheno pheno.txt --pheno-name Aff --hardy
##### in R
hardy = read.table("plink.hwe", header = T)
names(hardy)
hwe_prob = hardy[which(hardy$P < 0.0000009),]
hwe_prob
q()
##########
plink --file GWAS_clean3 --exclude HWE_out.txt --recode --out GWAS_clean4
Plink - Part 2 - Controlling for Substructure
plink --file GWAS_clean4 --genome --cluster --mds-plot 10
#### in R
mydata = read.table("mds_components.txt", header=T)
mydata$pch[mydata$Group==1 ] <-15
mydata$pch[mydata$Group==2 ] <-16
mydata$pch[mydata$Group==3 ] <-2
jpeg("mds.jpeg", height=500, width=500)
plot(mydata$C1, mydata$C2 ,pch=mydata$pch)
dev.off()
q()
######
plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --logistic --adjust --out unadj
plink --file GWAS_clean4 --genome --cluster --pca 10 header
plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --covar plink.eigenvec --covar-name PC1 --logistic --adjust --out PC1
plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --covar plink.eigenvec --covar-name PC1-PC2 --logistic --adjust --out PC1-PC2
#### in R
broadqq <-function(pvals, title)
{
observed <- sort(pvals)
lobs <- -(log10(observed))
expected <- c(1:length(observed))
lexp <- -(log10(expected / (length(expected)+1)))
plot(c(0,7), c(0,7), col="red", lwd=3, type="l", xlab="Expected (-logP)", ylab="Observed (-logP)", xlim=c(0,max(lobs)), ylim=c(0,max(lobs)), las=1, xaxs="i", yaxs="i", bty="l", main = title)
points(lexp, lobs, pch=23, cex=.4, bg="black") }
jpeg("qqplot_compare.jpeg", height=1000, width=500)
par(mfrow=c(2,1))
aff_unadj<-read.table("unadj.assoc.logistic", header=TRUE)
aff_unadj.add.p<-aff_unadj[aff_unadj$TEST==c("ADD"),]$P
broadqq(aff_unadj.add.p,"Some Trait Unadjusted")
aff_C1C2<-read.table("PC1-PC2.assoc.logistic", header=TRUE)
aff_C1C2.add.p<-aff_C1C2[aff_C1C2$TEST==c("ADD"),]$P
broadqq(aff_C1C2.add.p, "Some Trait Adjusted for PC1 and PC2")
dev.off()
gws_unadj = aff_unadj[which(aff_unadj$P < 0.0000001),]
gws_unadj
gws_adjusted = aff_C1C2[which(aff_C1C2$P < 0.0000001),]
gws_adjusted
RV-TDT
### Variant Annotation
vtools init rvtdt
vtools import --format vcf data/data.vcf --build hg19
vtools phenotype --from_file data/phen.txt
vtools execute ANNOVAR geneanno
vtools select variant "variant.region_type like '%splicing%'or variant.mut_type like 'nonsynonymous%' or variant.mut_type like 'frameshift%' or variant.mut_type like 'stop%'" -t func_variant
vtools export func_variant --format tped --samples 'phenotype is not null' > vat_raw.tped
# set marker name as chr_pos, needs to avoid duplicate name
sort -k4 -n vat_raw.tped | awk 'BEGIN{OFS="\t";prev="None";copy=1} {$2=$1"_"$4; $3=0; if($2==prev) {$2=$2"_"copy; copy=copy+1} else {prev=$2; copy=1}; print $0}' > vat_export.tped
vtools phenotype --out family sample_name pid mid sex phenotype > vat_export.tfam
vtools use refGene-hg19_20130904
vtools update func_variant --set 'maf=0.001' # set the maf to be 0.001
vtools select func_variant -o chr pos refGene.name2 maf --header > vat_export.anno
### Phasing Trio
plink --noweb --tfile vat_export --recode12 --me 1 1 --set-me-missing --out "recode12_noME"
sort -n -k1 -k6 -k2 recode12_noME.ped | sed 's/ /\t/g' | cut -f1,3,4,5 --complement > linkage.ped cut -f2 recode12_noME.map | awk 'BEGIN{OFS="\t";} {print "M",$0}' | sed '1i\I\tid\nA\tDisease' > linkage.dat
java -Xmx10000m -jar java/linkage2beagle.jar linkage.dat linkage.ped > pre_beagle.bgl
python script/pre_phase.py -i pre_beagle.bgl -a pre_beagle_withMissing.bgl
java -Xmx10000m -jar java/beagle.jar missing=0 trios=pre_beagle.bgl out=bgl_phased verbose=false redundant=true
gunzip bgl_phased.pre_beagle.bgl.phased.gz
### RV-TDT Analysis
python script/post_phase.py -a vat_export.anno -b bgl_phased.pre_beagle.bgl.phased -o genes/
for g in `ls genes | grep tped | cut -d"." -f1 | head -20`
do
echo "running rvTDT on gene "${g}
rvTDT exercise_proj -G ./genes/${g}.tped -P ./data/rvtdt.phen -M ./genes/${g}.map --adapt 500 --alpha 0.00001 --permut 2000 --lower_cutoff 0 --upper_cutoff 100 --minVariants 3 --maxMissRatio 1 done
done
Seqspark
hdfs dfs -put demo.vcf.bz2 hdfs dfs -put demo.tsv seqspark annotation.conf seqspark qc.conf seqspark demo.conf
VAT
vtools -h vtools init VATDemo vtools import *.vcf.gz --var_info DP filter --geno_info DP_geno --build hg18 -j1 vtools liftover hg19 head phenotypes.csv vtools phenotype --from_file phenotypes.csv --delimiter "," vtools show project vtools show tables vtools show table variant vtools show samples vtools show genotypes vtools show fields vtools select variant --count vtools show genotypes > GenotypeSummary.txt head GenotypeSummary.txt vtools output variant "max(DP)" "min(DP)" "avg(DP)" "stdev(DP)" "lower_quartile(DP)" "upper_quartile(DP)" --header vtools select variant "filter=’PASS’" --count vtools select variant "filter=’PASS’" -o "max(DP)" "min(DP)" "avg(DP)" "stdev(DP)" "lower_quartile(DP)" "upper_quartile(DP)" --header vtools update variant --from_stat ’total=#(GT)’ ’num=#(alt)’ ’het=#(het)’ ’hom=#(hom)’ ’other=#(other)’ ’minDP=min(DP_geno)’ ’maxDP=max(DP_geno)’ ’meanDP=avg(DP_geno)’ ’maf=maf()’ vtools show fields vtools show table variant vtools update variant --from_stat ’totalGD10=#(GT)’ ’numGD10=#(alt)’ ’hetGD10=#(het)’ ’homGD10=#(hom)’ ’otherGD10=#(other)’ ’mafGD10=maf()’ --genotypes "DP_geno > 10" vtools show fields vtools show table variant vtools output variant chr pos maf mafGD10 --header --limit 20 vtools phenotype --set "RACE=0" --samples "filename like ’YRI%’" vtools phenotype --set "RACE=1" --samples "filename like ’CEU%’" vtools show samples --limit 10 vtools update variant --from_stat ’CEU_mafGD10=maf()’ --genotypes ’DP_geno>10’ --samples "RACE=1" vtools update variant --from_stat ’YRI_mafGD10=maf()’ --genotypes ’DP_geno>10’ --samples "RACE=0" vtools output variant chr pos mafGD10 CEU_mafGD10 YRI_mafGD10 --header --limit 10 vtools phenotype --from_stat ’CEU_totalGD10=#(GT)’ ’CEU_numGD10=#(alt)’ --genotypes ’DP_geno>10’ --samples "RACE=1" vtools phenotype --from_stat ’YRI_totalGD10=#(GT)’ ’YRI_numGD10=#(alt)’ --genotypes ’DP_geno>10’ --samples "RACE=0" vtools phenotype --output sample_nameCEU_totalGD10CEU_numGD10YRI_totalGD10YRI_numGD10 --header vtools execute ANNOVAR geneanno vtools output variant chr pos ref alt mut_type --limit 20 --header vtools_report trans_ratio variant -n num vtools_report trans_ratio variant -n numGD10 vtools select variant "DP<15" -t to_remove vtools show tables vtools remove variants to_remove -v0 vtools show tables vtools remove genotypes "DP_geno<10" -v0 vtools select variant "mut_type like ’non%’ or mut_type like ’stop%’ or region_type=’splicing’" -t v_funct vtools show tables vtools show samples --limit 5 vtools select variant --samples "RACE=1" -t CEU mkdir -p ceu cd ceu vtools init ceu --parent ../ --variants CEU --samples "RACE=1" --build hg19 vtools show project vtools select variant "CEU_mafGD10>=0.05" -t common_ceu vtools select v_funct "CEU_mafGD10<0.01" -t rare_ceu vtools use refGene vtools show annotation refGene vtools associate -h vtools show tests vtools show test LinRegBurden vtools associate common_ceu BMI --covariate SEX -m "LinRegBurden --alternative 2" -j1 --to_db EA_CV > EA_CV.asso.res grep -i error *.log less EA_CV.asso.res sort -g -k7 EA_CV.asso.res | head vtools show fields vtools associate rare_ceu BMI --covariate SEX -m "LinRegBurden --alternative 2" -g refGene.name2 -j1 --to_db EA_RV > EA_RV.asso.res grep -i error *.log | tail -10 less EA_RV.asso.res sort -g -k6 EA_RV.asso.res | head vtools associate rare_ceu BMI --covariate SEX -m "VariableThresholdsQt --alternative 2 -p 100000 --adaptive 0.0005" -g refGene.name2 -j1 --to_db EA_RV > EA_RV_VT.asso.res grep -i error *.log | tail -10 less EA_RV_VT.asso.res sort -g -k6 EA_RV_VT.asso.res | head vtools select rare_ceu "refGene.name2=’ABCC1’" -o chr pos ref alt CEU_mafGD10 numGD10 mut_type --header cd .. vtools select variant --samples "RACE=0" -t YRI mkdir -p yri cd yri vtools init yri --parent ../ --variants YRI --samples "RACE=0" --build hg19 vtools select variant "YRI_mafGD10>=0.05" -t common_yri vtools select v_funct "YRI_mafGD10<0.01" -t rare_yri vtools use refGene vtools associate common_yri BMI --covariate SEX -m "LinRegBurden --alternative 2" -j1 --to_db YA_CV > YA_CV.asso.res vtools associate rare_yri BMI --covariate SEX -m "LinRegBurden --alternative 2" -g refGene.name2 -j1 --to_db YA_RV > YA_RV.asso.res vtools associate rare_yri BMI --covariate SEX -m "VariableThresholdsQt --alternative 2 -p 100000 --adaptive 0.0005" -g refGene.name2 -j1 --to_db YA_RV > YA_RV_VT.asso.res cd .. vtools_report meta_analysis ceu/EA_RV_VT.asso.res yri/YA_RV_VT.asso.res --beta 5 --pval 6 --se 7 -n 2 --link 1 > ME\ TA_RV_VT.asso.res cut -f1,3 META_RV_VT.asso.res | head