Difference between revisions of "AdvGeneMap2018Commands"
From Statistical Genetics Courses
Serveradmin (Talk | contribs) |
Serveradmin (Talk | contribs) |
||
Line 8: | Line 8: | ||
# Load files | # Load files | ||
library(GenABEL) | library(GenABEL) | ||
− | convert.snp.tped(tped & | + | convert.snp.tped(tped = "gwa_gabel_qtl.tped", tfam = "gwa_gabel_qtl.tfam", out = "gwa_gabel_qtl.raw", strand = "u") |
− | g.dat <- load.gwaa.data(phen & | + | g.dat <- load.gwaa.data(phen = "gwa_gabel_qtl.praw", gen = "gwa_gabel_qtl.raw", force = T) |
slotNames(g.dat) | slotNames(g.dat) | ||
slotNames(g.dat@gtdata) | slotNames(g.dat@gtdata) | ||
Line 20: | Line 20: | ||
# Trait | # Trait | ||
summary(g.dat@phdata$disease) | summary(g.dat@phdata$disease) | ||
− | hist(g.dat@phdata$disease, main& | + | hist(g.dat@phdata$disease, main="Quantitative Phenotype data summary", xlab = "Systolic pressure measure", freq = F,breaks=20, col="gray") |
rug(g.dat@phdata$disease) | rug(g.dat@phdata$disease) | ||
### | ### | ||
Line 26: | Line 26: | ||
### | ### | ||
# GLM test | # GLM test | ||
− | test.snp <- scan.glm('disease ~ CRSNP', family & | + | test.snp <- scan.glm('disease ~ CRSNP', family = gaussian(), data = g.dat) |
names(test.snp) | names(test.snp) | ||
alpha <- 5e-8 | alpha <- 5e-8 | ||
Line 32: | Line 32: | ||
test.snp$P1df[test.snp$P1df < alpha] | test.snp$P1df[test.snp$P1df < alpha] | ||
# Score test | # Score test | ||
− | test.qt <- qtscore(disease, data & | + | test.qt <- qtscore(disease, data = g.dat, trait = "gaussian") |
slotNames(test.qt) | slotNames(test.qt) | ||
names(test.qt@results) | names(test.qt@results) | ||
Line 43: | Line 43: | ||
obs <- sort(results(test.qt)$P1df) | obs <- sort(results(test.qt)$P1df) | ||
ept <- c(1:length(obs)) / (length(obs) + 1) | ept <- c(1:length(obs)) / (length(obs) + 1) | ||
− | plot(-log10(ept), -log10(obs), main & | + | plot(-log10(ept), -log10(obs), main = "GWAS QQ plot, qtl", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)") |
− | abline(0, 1, col & | + | abline(0, 1, col = "red") |
− | abline(h & | + | abline(h = 8, lty = 2) |
# Manhattan plot | # Manhattan plot | ||
− | plot(test.qt, col & | + | plot(test.qt, col = "black") |
# Adding confounders | # Adding confounders | ||
− | test.qt.sex <- qtscore(disease ~ sex, data & | + | test.qt.sex <- qtscore(disease ~ sex, data = g.dat, trait = "gaussian") |
rownames(results(test.qt.sex))[results(test.qt)$P1df < alpha] | rownames(results(test.qt.sex))[results(test.qt)$P1df < alpha] | ||
− | summary(lm(disease ~ sex, data & | + | summary(lm(disease ~ sex, data = g.dat)) |
### | ### | ||
# MDS | # MDS | ||
### | ### | ||
− | gkin <- ibs(g.dat, weight & | + | gkin <- ibs(g.dat, weight = "freq") |
gkin[1:10,1:10] | gkin[1:10,1:10] | ||
− | cps.full <- cmdscale(as.dist(.5 - gkin), eig & | + | cps.full <- cmdscale(as.dist(.5 - gkin), eig = T, k = 10) |
names(cps.full) | names(cps.full) | ||
cps <- cps.full$points | cps <- cps.full$points | ||
− | plot(cps[,1], cps[,2], pch & | + | plot(cps[,1], cps[,2], pch = g.dat@phdata$popn) |
− | legend(-0.16, 0.06, c("TSI","MEX", "CEU"), pch & | + | legend(-0.16, 0.06, c("TSI","MEX", "CEU"), pch = c(1,2,3)) |
### | ### | ||
# Corrected test | # Corrected test | ||
Line 69: | Line 69: | ||
gpc.dat <- g.dat | gpc.dat <- g.dat | ||
gpc.dat@phdata<-cbind(g.dat@phdata, cps) | gpc.dat@phdata<-cbind(g.dat@phdata, cps) | ||
− | test.pc.a <- scan.glm('disease ~ CRSNP + C1 + C2 + C3 + C4 + C5', family& | + | test.pc.a <- scan.glm('disease ~ CRSNP + C1 + C2 + C3 + C4 + C5', family=gaussian(), data = gpc.dat) |
test.pc.a$snpnames[test.pc.a$P1df < alpha] | test.pc.a$snpnames[test.pc.a$P1df < alpha] | ||
test.pc.a$P1df[test.pc.a$P1df < alpha] | test.pc.a$P1df[test.pc.a$P1df < alpha] | ||
− | test.pc.b <- qtscore(disease ~ C1 + C2 + C3 + C4 + C5, data & | + | test.pc.b <- qtscore(disease ~ C1 + C2 + C3 + C4 + C5, data = gpc.dat, trait = "gaussian") |
test.pc.b@lambda | test.pc.b@lambda | ||
# scree plot | # scree plot | ||
− | plot(cps.full$eig[1:10]/sum(cps.full$eig), axes & | + | plot(cps.full$eig[1:10]/sum(cps.full$eig), axes = F, type = "b", xlab = "Components", ylim = c(0,0.05), ylab = "Proportion of Variations", main = "MDS analysis scree plot") |
axis(1, 1:10) | axis(1, 1:10) | ||
axis(2) | axis(2) | ||
# cumulative plot | # cumulative plot | ||
− | plot(cumsum(cps.full$eig[1:10])/sum(cps.full$eig), axes & | + | plot(cumsum(cps.full$eig[1:10])/sum(cps.full$eig), axes = F, type = "b", ylim = c(0,0.2), xlab = "Components", ylab = "Proportion of Variations", main = "MDS analysis cumulative plot") |
axis(1, 1:10) | axis(1, 1:10) | ||
axis(2) | axis(2) | ||
Line 90: | Line 90: | ||
# Check for inflation of statistic | # Check for inflation of statistic | ||
obs <- sort(results(test.qt)$chi2.1df) | obs <- sort(results(test.qt)$chi2.1df) | ||
− | ept <- sort(qchisq(1:length(obs) / (length(obs) + 1), df & | + | ept <- sort(qchisq(1:length(obs) / (length(obs) + 1), df = 1)) |
− | plot(ept, obs, main & | + | plot(ept, obs, main = "Genomic control (slope is the inflation factor)", xlab="Expected chisq, 1df", ylab="Observed chisq, 1df") |
− | abline(0, 1, col & | + | abline(0, 1, col = "red") |
− | abline(0, test.qt@lambda[1], lty & | + | abline(0, test.qt@lambda[1], lty = 2) |
# Definition of GIF | # Definition of GIF | ||
# Conventional definition | # Conventional definition | ||
Line 102: | Line 102: | ||
obs <- sort(results(test.qt)$Pc1df) | obs <- sort(results(test.qt)$Pc1df) | ||
ept <- c(1:length(obs)) / (length(obs) + 1) | ept <- c(1:length(obs)) / (length(obs) + 1) | ||
− | plot(-log10(ept), -log10(obs), main & | + | plot(-log10(ept), -log10(obs), main = "GWAS QQ plot adj. via Genomic Control", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)") |
− | abline(0, 1, col & | + | abline(0, 1, col = "red") |
− | abline(h & | + | abline(h = 8, lty = 2) |
# EIGENSTRAT | # EIGENSTRAT | ||
− | adj.gkin & | + | adj.gkin = gkin |
− | diag(adj.gkin) & | + | diag(adj.gkin) = hom(g.dat)$Var |
− | # naxes & | + | # naxes = 3 is default value |
− | test.eg <- egscore(disease, data & | + | test.eg <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = 2) |
descriptives.scan(test.eg) | descriptives.scan(test.eg) | ||
snp.eg <- row.names(results(test.eg))[results(test.eg)$P1df < alpha] | snp.eg <- row.names(results(test.eg))[results(test.eg)$P1df < alpha] | ||
Line 119: | Line 119: | ||
# Change #PCs | # Change #PCs | ||
for (k in 1:10){ | for (k in 1:10){ | ||
− | test.tmp <- egscore(disease, data & | + | test.tmp <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = k) |
print(test.tmp@lambda$estimate) | print(test.tmp@lambda$estimate) | ||
} | } | ||
Line 125: | Line 125: | ||
obs <- sort(results(test.eg)$Pc1df) | obs <- sort(results(test.eg)$Pc1df) | ||
ept <- c(1:length(obs)) / (length(obs) + 1) | ept <- c(1:length(obs)) / (length(obs) + 1) | ||
− | qqplot(-log10(ept), -log10(obs), main & | + | qqplot(-log10(ept), -log10(obs), main = "GWAS QQ plot adj. w/ EIGENSTRAT", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)") |
− | abline(0, 1, col & | + | abline(0, 1, col = "red") |
− | abline(h & | + | abline(h = 8, lty = 2) |
# Manhattan plot comparison | # Manhattan plot comparison | ||
− | plot(test.qt, col & | + | plot(test.qt, col = "black") |
− | add.plot(test.eg, col & | + | add.plot(test.eg, col = "gray", pch = 3) |
− | legend("topright", c("Original plot","After correction w/ EIGENSTRAT"), pch & | + | legend("topright", c("Original plot","After correction w/ EIGENSTRAT"), pch = c(1,3)) |
### | ### | ||
# Basic test, binary trait | # Basic test, binary trait | ||
### | ### | ||
# load files to GenABEL | # load files to GenABEL | ||
− | convert.snp.tped(tped & | + | convert.snp.tped(tped = "gwa_gabel.tped", tfam = "gwa_gabel.tfam", out = "gwa_gabel.raw", strand = "u") |
− | b.dat <- load.gwaa.data(phen & | + | b.dat <- load.gwaa.data(phen = "gwa_gabel.praw", gen = "gwa_gabel.raw", force = T) |
slotNames(b.dat) | slotNames(b.dat) | ||
slotNames(b.dat@gtdata) | slotNames(b.dat@gtdata) | ||
Line 144: | Line 144: | ||
b.dat@gtdata@nids | b.dat@gtdata@nids | ||
# number of cases and controls | # number of cases and controls | ||
− | case.size <- length(which(b.dat@phdata$disease & | + | case.size <- length(which(b.dat@phdata$disease == 1)) |
− | control.size <- length(which(b.dat@phdata$disease & | + | control.size <- length(which(b.dat@phdata$disease == 0)) |
case.size | case.size | ||
control.size | control.size | ||
Line 151: | Line 151: | ||
snpsb.total <- b.dat@gtdata@nsnps | snpsb.total <- b.dat@gtdata@nsnps | ||
# GLM test | # GLM test | ||
− | testb.snp <- scan.glm('disease ~ CRSNP', family & | + | testb.snp <- scan.glm('disease ~ CRSNP', family = binomial(), data = b.dat) |
names(testb.snp) | names(testb.snp) | ||
alpha <- 5e-8 | alpha <- 5e-8 | ||
Line 157: | Line 157: | ||
testb.snp$P1df[testb.snp$P1df < alpha] | testb.snp$P1df[testb.snp$P1df < alpha] | ||
# Score test | # Score test | ||
− | testb.qt <- qtscore(disease, data & | + | testb.qt <- qtscore(disease, data = b.dat, trait = "binomial") |
slotNames(testb.qt) | slotNames(testb.qt) | ||
descriptives.scan(testb.qt) | descriptives.scan(testb.qt) | ||
Line 177: | Line 177: | ||
#### in R - open R by simply typing R | #### in R - open R by simply typing R | ||
setwd("to_your_working_directory/") | setwd("to_your_working_directory/") | ||
− | sexcheck & | + | sexcheck = read.table("GWAS_sex_checking.sexcheck", header=T) |
names(sexcheck) | names(sexcheck) | ||
− | sex_problem & | + | sex_problem = sexcheck[which(sexcheck$STATUS=="PROBLEM"),] |
sex_problem | sex_problem | ||
q() | q() | ||
Line 186: | Line 186: | ||
#### in R | #### in R | ||
setwd("to_your_working_directory/") | setwd("to_your_working_directory/") | ||
− | dups & | + | dups = read.table("duplicates.genome", header = T) |
− | problem_pairs & | + | problem_pairs = dups[which(dups$PI_HAT > 0.4),] |
problem_pairs | problem_pairs | ||
− | problem_pairs & | + | problem_pairs = dups[which(dups$PI_HAT > 0.05),] |
− | myvars & | + | myvars = c("FID1", "IID1", "FID2", "IID2", "PI_HAT") |
problem_pairs[myvars] | problem_pairs[myvars] | ||
q() | q() | ||
Line 197: | Line 197: | ||
plink --file GWAS_clean3 --het | plink --file GWAS_clean3 --het | ||
###### in R | ###### in R | ||
− | Dataset <- read.table("plink.het", header& | + | Dataset <- read.table("plink.het", header=TRUE, sep="", na.strings="NA", dec=".", strip.white=TRUE) |
mean(Dataset$F) | mean(Dataset$F) | ||
sd(Dataset$F) | sd(Dataset$F) | ||
− | jpeg("hist.jpeg", height& | + | jpeg("hist.jpeg", height=1000, width=1000) |
− | hist(scale(Dataset$F), xlim& | + | hist(scale(Dataset$F), xlim=c(-4,4)) |
dev.off() | dev.off() | ||
q() | q() | ||
Line 207: | Line 207: | ||
plink --file GWAS_clean3 --pheno pheno.txt --pheno-name Aff --hardy | plink --file GWAS_clean3 --pheno pheno.txt --pheno-name Aff --hardy | ||
##### in R | ##### in R | ||
− | hardy & | + | hardy = read.table("plink.hwe", header = T) |
names(hardy) | names(hardy) | ||
− | hwe_prob & | + | hwe_prob = hardy[which(hardy$P < 0.0000009),] |
hwe_prob | hwe_prob | ||
q() | q() | ||
Line 220: | Line 220: | ||
plink --file GWAS_clean4 --genome --cluster --mds-plot 10 | plink --file GWAS_clean4 --genome --cluster --mds-plot 10 | ||
#### in R | #### in R | ||
− | mydata & | + | mydata = read.table("mds_components.txt", header=T) |
− | mydata$pch[mydata$Group& | + | mydata$pch[mydata$Group==1 ] <-15 |
− | mydata$pch[mydata$Group& | + | mydata$pch[mydata$Group==2 ] <-16 |
− | mydata$pch[mydata$Group& | + | mydata$pch[mydata$Group==3 ] <-2 |
− | jpeg("mds.jpeg", height& | + | jpeg("mds.jpeg", height=500, width=500) |
− | plot(mydata$C1, mydata$C2 ,pch& | + | plot(mydata$C1, mydata$C2 ,pch=mydata$pch) |
dev.off() | dev.off() | ||
q() | q() | ||
Line 240: | Line 240: | ||
expected <- c(1:length(observed)) | expected <- c(1:length(observed)) | ||
lexp <- -(log10(expected / (length(expected)+1))) | lexp <- -(log10(expected / (length(expected)+1))) | ||
− | plot(c(0,7), c(0,7), col& | + | plot(c(0,7), c(0,7), col="red", lwd=3, type="l", xlab="Expected (-logP)", ylab="Observed (-logP)", xlim=c(0,max(lobs)), ylim=c(0,max(lobs)), las=1, xaxs="i", yaxs="i", bty="l", main = title) |
− | points(lexp, lobs, pch& | + | points(lexp, lobs, pch=23, cex=.4, bg="black") } |
− | jpeg("qqplot_compare.jpeg", height& | + | jpeg("qqplot_compare.jpeg", height=1000, width=500) |
− | par(mfrow& | + | par(mfrow=c(2,1)) |
− | aff_unadj<-read.table("unadj.assoc.logistic", header& | + | aff_unadj<-read.table("unadj.assoc.logistic", header=TRUE) |
− | aff_unadj.add.p<-aff_unadj[aff_unadj$TEST& | + | aff_unadj.add.p<-aff_unadj[aff_unadj$TEST==c("ADD"),]$P |
broadqq(aff_unadj.add.p,"Some Trait Unadjusted") | broadqq(aff_unadj.add.p,"Some Trait Unadjusted") | ||
− | aff_C1C2<-read.table("PC1-PC2.assoc.logistic", header& | + | aff_C1C2<-read.table("PC1-PC2.assoc.logistic", header=TRUE) |
− | aff_C1C2.add.p<-aff_C1C2[aff_C1C2$TEST& | + | aff_C1C2.add.p<-aff_C1C2[aff_C1C2$TEST==c("ADD"),]$P |
broadqq(aff_C1C2.add.p, "Some Trait Adjusted for PC1 and PC2") | broadqq(aff_C1C2.add.p, "Some Trait Adjusted for PC1 and PC2") | ||
dev.off() | dev.off() | ||
− | gws_unadj & | + | gws_unadj = aff_unadj[which(aff_unadj$P < 0.0000001),] |
gws_unadj | gws_unadj | ||
− | gws_adjusted & | + | gws_adjusted = aff_C1C2[which(aff_C1C2$P < 0.0000001),] |
gws_adjusted | gws_adjusted | ||
Line 277: | Line 277: | ||
head GenotypeSummary.txt | head GenotypeSummary.txt | ||
vtools output variant "max(DP)" "min(DP)" "avg(DP)" "stdev(DP)" "lower_quartile(DP)" "upper_quartile(DP)" --header | vtools output variant "max(DP)" "min(DP)" "avg(DP)" "stdev(DP)" "lower_quartile(DP)" "upper_quartile(DP)" --header | ||
− | vtools select variant "filter& | + | vtools select variant "filter=’PASS’" --count |
− | vtools select variant "filter& | + | vtools select variant "filter=’PASS’" -o "max(DP)" "min(DP)" "avg(DP)" "stdev(DP)" "lower_quartile(DP)" "upper_quartile(DP)" --header |
− | vtools update variant --from_stat ’total& | + | vtools update variant --from_stat ’total=#(GT)’ ’num=#(alt)’ ’het=#(het)’ ’hom=#(hom)’ ’other=#(other)’ ’minDP=min(DP_geno)’ ’maxDP=max(DP_geno)’ ’meanDP=avg(DP_geno)’ ’maf=maf()’ |
vtools show fields | vtools show fields | ||
vtools show table variant | vtools show table variant | ||
− | vtools update variant --from_stat ’totalGD10& | + | vtools update variant --from_stat ’totalGD10=#(GT)’ ’numGD10=#(alt)’ ’hetGD10=#(het)’ ’homGD10=#(hom)’ ’otherGD10=#(other)’ ’mafGD10=maf()’ --genotypes "DP_geno > 10" |
vtools show fields | vtools show fields | ||
vtools show table variant | vtools show table variant | ||
vtools output variant chr pos maf mafGD10 --header --limit 20 | vtools output variant chr pos maf mafGD10 --header --limit 20 | ||
− | vtools phenotype --set "RACE& | + | vtools phenotype --set "RACE=0" --samples "filename like ’YRI%’" |
− | vtools phenotype --set "RACE& | + | vtools phenotype --set "RACE=1" --samples "filename like ’CEU%’" |
vtools show samples --limit 10 | vtools show samples --limit 10 | ||
− | vtools update variant --from_stat ’CEU_mafGD10& | + | vtools update variant --from_stat ’CEU_mafGD10=maf()’ --genotypes ’DP_geno>10’ --samples "RACE=1" |
− | vtools update variant --from_stat ’YRI_mafGD10& | + | vtools update variant --from_stat ’YRI_mafGD10=maf()’ --genotypes ’DP_geno>10’ --samples "RACE=0" |
vtools output variant chr pos mafGD10 CEU_mafGD10 YRI_mafGD10 --header --limit 10 | vtools output variant chr pos mafGD10 CEU_mafGD10 YRI_mafGD10 --header --limit 10 | ||
− | vtools phenotype --from_stat ’CEU_totalGD10& | + | vtools phenotype --from_stat ’CEU_totalGD10=#(GT)’ ’CEU_numGD10=#(alt)’ --genotypes ’DP_geno>10’ --samples "RACE=1" |
− | vtools phenotype --from_stat ’YRI_totalGD10& | + | vtools phenotype --from_stat ’YRI_totalGD10=#(GT)’ ’YRI_numGD10=#(alt)’ --genotypes ’DP_geno>10’ --samples "RACE=0" |
vtools phenotype --output sample_name CEU_totalGD10 CEU_numGD10 YRI_totalGD10 YRI_numGD10 --header | vtools phenotype --output sample_name CEU_totalGD10 CEU_numGD10 YRI_totalGD10 YRI_numGD10 --header | ||
vtools execute ANNOVAR geneanno | vtools execute ANNOVAR geneanno | ||
Line 304: | Line 304: | ||
vtools show tables | vtools show tables | ||
vtools remove genotypes "DP_geno<10" -v0 | vtools remove genotypes "DP_geno<10" -v0 | ||
− | vtools select variant "mut_type like ’non%’ or mut_type like ’stop%’ or region_type& | + | vtools select variant "mut_type like ’non%’ or mut_type like ’stop%’ or region_type=’splicing’" -t v_funct |
vtools show tables | vtools show tables | ||
vtools show samples --limit 5 | vtools show samples --limit 5 | ||
− | vtools select variant --samples "RACE& | + | vtools select variant --samples "RACE=1" -t CEU |
mkdir -p ceu | mkdir -p ceu | ||
cd ceu | cd ceu | ||
− | vtools init ceu --parent ../ --variants CEU --samples "RACE& | + | vtools init ceu --parent ../ --variants CEU --samples "RACE=1" --build hg19 vtools show project |
− | vtools select variant "CEU_mafGD10>& | + | vtools select variant "CEU_mafGD10>=0.05" -t common_ceu |
vtools select v_funct "CEU_mafGD10<0.01" -t rare_ceu | vtools select v_funct "CEU_mafGD10<0.01" -t rare_ceu | ||
vtools use refGene | vtools use refGene | ||
Line 331: | Line 331: | ||
less EA_RV_VT.asso.res | less EA_RV_VT.asso.res | ||
sort -g -k6 EA_RV_VT.asso.res | head | sort -g -k6 EA_RV_VT.asso.res | head | ||
− | vtools select rare_ceu "refGene.name2& | + | vtools select rare_ceu "refGene.name2=’ABCC1’" -o chr pos ref alt CEU_mafGD10 numGD10 mut_type --header |
cd .. | cd .. | ||
− | vtools select variant --samples "RACE& | + | vtools select variant --samples "RACE=0" -t YRI |
mkdir -p yri | mkdir -p yri | ||
cd yri | cd yri | ||
− | vtools init yri --parent ../ --variants YRI --samples "RACE& | + | vtools init yri --parent ../ --variants YRI --samples "RACE=0" --build hg19 vtools select variant "YRI_mafGD10>=0.05" -t common_yri vtools select v_funct "YRI_mafGD10<0.01" -t rare_yri |
vtools use refGene | vtools use refGene | ||
vtools associate common_yri BMI --covariate SEX -m "LinRegBurden --alternative 2" -j1 --to_db YA_CV > YA_CV.asso.res | vtools associate common_yri BMI --covariate SEX -m "LinRegBurden --alternative 2" -j1 --to_db YA_CV > YA_CV.asso.res |
Revision as of 15:24, 23 January 2018
GenABEL
# GenABEL walkthrough: association scans on a quantitative trait (g.dat),
# population-structure corrections (MDS covariates, genomic control,
# EIGENSTRAT), then the same basic scans on a binary trait (b.dat).
# Statements are meant to be run one at a time in an interactive R session;
# bare expressions print their value.

# Load files
library(GenABEL)
convert.snp.tped(
  tpedfile = "gwa_gabel_qtl.tped",
  tfamfile = "gwa_gabel_qtl.tfam",
  outfile = "gwa_gabel_qtl.raw",
  strand = "u"
)
g.dat <- load.gwaa.data(
  phenofile = "gwa_gabel_qtl.praw",
  genofile = "gwa_gabel_qtl.raw",
  force = TRUE
)
slotNames(g.dat)
slotNames(g.dat@gtdata)
colnames(g.dat@phdata)
# sample size
sample.size <- g.dat@gtdata@nids
# number of SNPs
snps.total <- g.dat@gtdata@nsnps
print(c(sample.size, snps.total))
# Trait
summary(g.dat@phdata$disease)
hist(
  g.dat@phdata$disease,
  main = "Quantitative Phenotype data summary",
  xlab = "Systolic pressure measure",
  freq = FALSE, breaks = 20, col = "gray"
)
rug(g.dat@phdata$disease)

###
# tests for association
###
# GLM test
test.snp <- scan.glm("disease ~ CRSNP", family = gaussian(), data = g.dat)
names(test.snp)
# genome-wide significance threshold used for every hit list below
alpha <- 5e-8
test.snp$snpnames[test.snp$P1df < alpha]
test.snp$P1df[test.snp$P1df < alpha]
# Score test
test.qt <- qtscore(disease, data = g.dat, trait.type = "gaussian")
slotNames(test.qt)
names(test.qt@results)
test.qt@lambda
descriptives.scan(test.qt)
rownames(results(test.qt))[results(test.qt)$P1df < alpha]
results(test.qt)$P1df[results(test.qt)$P1df < alpha]
results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha]
# QQ plot
obs <- sort(results(test.qt)$P1df)
ept <- seq_along(obs) / (length(obs) + 1)
plot(
  -log10(ept), -log10(obs),
  main = "GWAS QQ plot, qtl",
  xlab = "Expected -log10(pvalue)", ylab = "Observed -log10(pvalue)"
)
abline(0, 1, col = "red")
abline(h = 8, lty = 2)
# Manhattan plot
plot(test.qt, col = "black")
# Adding confounders
test.qt.sex <- qtscore(disease ~ sex, data = g.dat, trait.type = "gaussian")
# Select hits with the sex-adjusted p-values (the earlier revision indexed
# with the unadjusted scan, so it just repeated the unadjusted hit list).
rownames(results(test.qt.sex))[results(test.qt.sex)$P1df < alpha]
# lm() needs the phenotype data frame, not the whole gwaa.data object
summary(lm(disease ~ sex, data = g.dat@phdata))

###
# MDS
###
gkin <- ibs(g.dat, weight = "freq")
gkin[1:10, 1:10]
cps.full <- cmdscale(as.dist(0.5 - gkin), eig = TRUE, k = 10)
names(cps.full)
cps <- cps.full$points
plot(cps[, 1], cps[, 2], pch = g.dat@phdata$popn)
legend(-0.16, 0.06, c("TSI", "MEX", "CEU"), pch = c(1, 2, 3))

###
# Corrected test
###
# Incorporating PCs as predictors
colnames(cps) <- paste0("C", 1:10)
gpc.dat <- g.dat
gpc.dat@phdata <- cbind(g.dat@phdata, cps)
test.pc.a <- scan.glm(
  "disease ~ CRSNP + C1 + C2 + C3 + C4 + C5",
  family = gaussian(), data = gpc.dat
)
test.pc.a$snpnames[test.pc.a$P1df < alpha]
test.pc.a$P1df[test.pc.a$P1df < alpha]
test.pc.b <- qtscore(
  disease ~ C1 + C2 + C3 + C4 + C5,
  data = gpc.dat, trait.type = "gaussian"
)
test.pc.b@lambda
# scree plot
plot(
  cps.full$eig[1:10] / sum(cps.full$eig),
  axes = FALSE, type = "b",
  xlab = "Components", ylim = c(0, 0.05),
  ylab = "Proportion of Variations",
  main = "MDS analysis scree plot"
)
axis(1, 1:10)
axis(2)
# cumulative plot
plot(
  cumsum(cps.full$eig[1:10]) / sum(cps.full$eig),
  axes = FALSE, type = "b",
  ylim = c(0, 0.2), xlab = "Components",
  ylab = "Proportion of Variations",
  main = "MDS analysis cumulative plot"
)
axis(1, 1:10)
axis(2)

# Genomic control
# Uncorrected GIF
test.qt@lambda
# Corrected p-value
row.names(results(test.qt))[results(test.qt)$Pc1df < alpha]
results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha]
# Check for inflation of statistic
obs <- sort(results(test.qt)$chi2.1df)
ept <- sort(qchisq(seq_along(obs) / (length(obs) + 1), df = 1))
plot(
  ept, obs,
  main = "Genomic control (slope is the inflation factor)",
  xlab = "Expected chisq, 1df", ylab = "Observed chisq, 1df"
)
abline(0, 1, col = "red")
abline(0, test.qt@lambda$estimate, lty = 2)
# Definition of GIF
# Conventional definition
median(results(test.qt)$chi2.1df) / 0.456
# GenABEL definition
coef(lm(obs ~ ept))[2]
# QQ plot
obs <- sort(results(test.qt)$Pc1df)
ept <- seq_along(obs) / (length(obs) + 1)
plot(
  -log10(ept), -log10(obs),
  main = "GWAS QQ plot adj. via Genomic Control",
  xlab = "Expected -log10(pvalue)", ylab = "Observed -log10(pvalue)"
)
abline(0, 1, col = "red")
abline(h = 8, lty = 2)

# EIGENSTRAT
adj.gkin <- gkin
diag(adj.gkin) <- hom(g.dat)$Var
# naxes = 3 is default value
test.eg <- egscore(
  disease, data = g.dat, kinship.matrix = adj.gkin, naxes = 2
)
descriptives.scan(test.eg)
snp.eg <- row.names(results(test.eg))[results(test.eg)$P1df < alpha]
pvalue.eg <- results(test.eg)$P1df[results(test.eg)$P1df < alpha]
lambda.eg <- test.eg@lambda
snp.eg
pvalue.eg
lambda.eg
# Change #PCs
for (k in 1:10) {
  test.tmp <- egscore(
    disease, data = g.dat, kinship.matrix = adj.gkin, naxes = k
  )
  print(test.tmp@lambda$estimate)
}
# QQ plot
obs <- sort(results(test.eg)$Pc1df)
ept <- seq_along(obs) / (length(obs) + 1)
qqplot(
  -log10(ept), -log10(obs),
  main = "GWAS QQ plot adj. w/ EIGENSTRAT",
  xlab = "Expected -log10(pvalue)", ylab = "Observed -log10(pvalue)"
)
abline(0, 1, col = "red")
abline(h = 8, lty = 2)
# Manhattan plot comparison
plot(test.qt, col = "black")
add.plot(test.eg, col = "gray", pch = 3)
legend(
  "topright",
  c("Original plot", "After correction w/ EIGENSTRAT"),
  pch = c(1, 3)
)

###
# Basic test, binary trait
###
# load files to GenABEL
convert.snp.tped(
  tpedfile = "gwa_gabel.tped",
  tfamfile = "gwa_gabel.tfam",
  outfile = "gwa_gabel.raw",
  strand = "u"
)
b.dat <- load.gwaa.data(
  phenofile = "gwa_gabel.praw",
  genofile = "gwa_gabel.raw",
  force = TRUE
)
slotNames(b.dat)
slotNames(b.dat@gtdata)
colnames(b.dat@phdata)
# sample size
b.dat@gtdata@nids
# number of cases and controls
case.size <- sum(b.dat@phdata$disease == 1, na.rm = TRUE)
control.size <- sum(b.dat@phdata$disease == 0, na.rm = TRUE)
case.size
control.size
# number of SNPs
snpsb.total <- b.dat@gtdata@nsnps
# GLM test
testb.snp <- scan.glm("disease ~ CRSNP", family = binomial(), data = b.dat)
names(testb.snp)
alpha <- 5e-8
testb.snp$snpnames[testb.snp$P1df < alpha]
testb.snp$P1df[testb.snp$P1df < alpha]
# Score test
testb.qt <- qtscore(disease, data = b.dat, trait.type = "binomial")
slotNames(testb.qt)
descriptives.scan(testb.qt)
row.names(results(testb.qt))[results(testb.qt)$P1df < alpha]
results(testb.qt)$P1df[results(testb.qt)$P1df < alpha]
results(testb.qt)$Pc1df[results(testb.qt)$Pc1df < alpha]
Plink - Part 1 - Data QC
# Data-QC transcript. Lines starting with `plink` are run in the system
# shell; the lines between a "#### in R" marker and the following q() are
# typed into an R session started in the same working directory.
plink --file GWAS
plink --file GWAS --mind 0.10 --recode --out GWAS_clean_mind
plink --file GWAS_clean_mind --maf 0.05 --recode --out MAF_greater_5
plink --file GWAS_clean_mind --exclude MAF_greater_5.map --recode --out MAF_less_5
plink --file MAF_greater_5 --geno 0.05 --recode --out MAF_greater_5_clean
plink --file MAF_less_5 --geno 0.01 --recode --out MAF_less_5_clean
plink --file MAF_greater_5_clean --merge MAF_less_5_clean.ped MAF_less_5_clean.map --recode --out GWAS_MAF_clean
plink --file GWAS_MAF_clean --mind 0.03 --recode --out GWAS_clean2
plink --file GWAS_clean2 --check-sex --out GWAS_sex_checking
#### in R - open R by simply typing R
setwd("to_your_working_directory/")
sexcheck <- read.table("GWAS_sex_checking.sexcheck", header = TRUE)
names(sexcheck)
# rows whose STATUS column reads "PROBLEM"
sex_problem <- sexcheck[which(sexcheck$STATUS == "PROBLEM"), ]
sex_problem
q()
##################################
plink --file GWAS_clean2 --genome --out duplicates
#### in R
setwd("to_your_working_directory/")
dups <- read.table("duplicates.genome", header = TRUE)
problem_pairs <- dups[which(dups$PI_HAT > 0.4), ]
problem_pairs
# second pass with a looser PI_HAT cut-off, showing only the ID columns
problem_pairs <- dups[which(dups$PI_HAT > 0.05), ]
myvars <- c("FID1", "IID1", "FID2", "IID2", "PI_HAT")
problem_pairs[myvars]
q()
######
plink --file GWAS_clean2 --remove IBS_excluded.txt --recode --out GWAS_clean3
plink --file GWAS_clean3 --het
###### in R
Dataset <- read.table(
  "plink.het",
  header = TRUE, sep = "", na.strings = "NA", dec = ".", strip.white = TRUE
)
mean(Dataset$F)
sd(Dataset$F)
jpeg("hist.jpeg", height = 1000, width = 1000)
hist(scale(Dataset$F), xlim = c(-4, 4))
dev.off()
q()
######
plink --file GWAS_clean3 --pheno pheno.txt --pheno-name Aff --hardy
##### in R
hardy <- read.table("plink.hwe", header = TRUE)
names(hardy)
hwe_prob <- hardy[which(hardy$P < 0.0000009), ]
hwe_prob
q()
##########
plink --file GWAS_clean3 --exclude HWE_out.txt --recode --out GWAS_clean4
Plink - Part 2 - Controlling for Substructure
# Substructure transcript. Lines starting with `plink` are run in the system
# shell; the lines after each "#### in R" marker are typed into an R session.
plink --file GWAS_clean4 --genome --cluster --mds-plot 10
#### in R
mydata <- read.table("mds_components.txt", header = TRUE)
# one plotting symbol per value of the Group column
mydata$pch[mydata$Group == 1] <- 15
mydata$pch[mydata$Group == 2] <- 16
mydata$pch[mydata$Group == 3] <- 2
jpeg("mds.jpeg", height = 500, width = 500)
plot(mydata$C1, mydata$C2, pch = mydata$pch)
dev.off()
q()
######
plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --logistic --adjust --out unadj
plink --file GWAS_clean4 --genome --cluster --pca 10 header
plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --covar plink.eigenvec --covar-name PC1 --logistic --adjust --out PC1
plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --covar plink.eigenvec --covar-name PC1-PC2 --logistic --adjust --out PC1-PC2
#### in R
# Draw a QQ plot of -log10 p-values on the current graphics device.
#   pvals: numeric vector of p-values (sort() drops any NA)
#   title: main title for the plot
# Expected quantiles are rank / (n + 1). The red reference line runs from
# (0, 0) to (7, 7); both axes are cut at the largest observed -log10(p).
broadqq <- function(pvals, title) {
  observed <- sort(pvals)
  lobs <- -(log10(observed))
  expected <- seq_along(observed)
  lexp <- -(log10(expected / (length(expected) + 1)))
  plot(
    c(0, 7), c(0, 7),
    col = "red", lwd = 3, type = "l",
    xlab = "Expected (-logP)", ylab = "Observed (-logP)",
    xlim = c(0, max(lobs)), ylim = c(0, max(lobs)),
    las = 1, xaxs = "i", yaxs = "i", bty = "l", main = title
  )
  points(lexp, lobs, pch = 23, cex = .4, bg = "black")
}
jpeg("qqplot_compare.jpeg", height = 1000, width = 500)
par(mfrow = c(2, 1))
aff_unadj <- read.table("unadj.assoc.logistic", header = TRUE)
# keep only the rows where TEST is "ADD"
aff_unadj.add.p <- aff_unadj[aff_unadj$TEST == "ADD", ]$P
broadqq(aff_unadj.add.p, "Some Trait Unadjusted")
aff_C1C2 <- read.table("PC1-PC2.assoc.logistic", header = TRUE)
aff_C1C2.add.p <- aff_C1C2[aff_C1C2$TEST == "ADD", ]$P
broadqq(aff_C1C2.add.p, "Some Trait Adjusted for PC1 and PC2")
dev.off()
gws_unadj <- aff_unadj[which(aff_unadj$P < 0.0000001), ]
gws_unadj
gws_adjusted <- aff_C1C2[which(aff_C1C2$P < 0.0000001), ]
gws_adjusted
VAT
# Variant Association Tools (vtools) session, one command per line.
# Quoting uses plain ASCII quotes so the shell passes each condition or
# 'name=expr' statistic to vtools as a single argument.

# Project setup and import
vtools -h
vtools init VATDemo
vtools import *.vcf.gz --var_info DP filter --geno_info DP_geno --build hg18 -j1
vtools liftover hg19
head phenotypes.csv
vtools phenotype --from_file phenotypes.csv --delimiter ","

# Inspect the project
vtools show project
vtools show tables
vtools show table variant
vtools show samples
vtools show genotypes
vtools show fields
vtools select variant --count
vtools show genotypes > GenotypeSummary.txt
head GenotypeSummary.txt

# Depth summaries and per-variant statistics
vtools output variant "max(DP)" "min(DP)" "avg(DP)" "stdev(DP)" "lower_quartile(DP)" "upper_quartile(DP)" --header
vtools select variant "filter='PASS'" --count
vtools select variant "filter='PASS'" -o "max(DP)" "min(DP)" "avg(DP)" "stdev(DP)" "lower_quartile(DP)" "upper_quartile(DP)" --header
vtools update variant --from_stat 'total=#(GT)' 'num=#(alt)' 'het=#(het)' 'hom=#(hom)' 'other=#(other)' 'minDP=min(DP_geno)' 'maxDP=max(DP_geno)' 'meanDP=avg(DP_geno)' 'maf=maf()'
vtools show fields
vtools show table variant
vtools update variant --from_stat 'totalGD10=#(GT)' 'numGD10=#(alt)' 'hetGD10=#(het)' 'homGD10=#(hom)' 'otherGD10=#(other)' 'mafGD10=maf()' --genotypes "DP_geno > 10"
vtools show fields
vtools show table variant
vtools output variant chr pos maf mafGD10 --header --limit 20

# Population labels and per-population statistics
vtools phenotype --set "RACE=0" --samples "filename like 'YRI%'"
vtools phenotype --set "RACE=1" --samples "filename like 'CEU%'"
vtools show samples --limit 10
vtools update variant --from_stat 'CEU_mafGD10=maf()' --genotypes 'DP_geno>10' --samples "RACE=1"
vtools update variant --from_stat 'YRI_mafGD10=maf()' --genotypes 'DP_geno>10' --samples "RACE=0"
vtools output variant chr pos mafGD10 CEU_mafGD10 YRI_mafGD10 --header --limit 10
vtools phenotype --from_stat 'CEU_totalGD10=#(GT)' 'CEU_numGD10=#(alt)' --genotypes 'DP_geno>10' --samples "RACE=1"
vtools phenotype --from_stat 'YRI_totalGD10=#(GT)' 'YRI_numGD10=#(alt)' --genotypes 'DP_geno>10' --samples "RACE=0"
vtools phenotype --output sample_name CEU_totalGD10 CEU_numGD10 YRI_totalGD10 YRI_numGD10 --header

# Annotation and filtering
vtools execute ANNOVAR geneanno
vtools output variant chr pos ref alt mut_type --limit 20 --header
vtools_report trans_ratio variant -n num
vtools_report trans_ratio variant -n numGD10
vtools select variant "DP<15" -t to_remove
vtools show tables
vtools remove variants to_remove -v0
vtools show tables
vtools remove genotypes "DP_geno<10" -v0
vtools select variant "mut_type like 'non%' or mut_type like 'stop%' or region_type='splicing'" -t v_funct
vtools show tables
vtools show samples --limit 5

# CEU sub-project: association tests
vtools select variant --samples "RACE=1" -t CEU
mkdir -p ceu
cd ceu
vtools init ceu --parent ../ --variants CEU --samples "RACE=1" --build hg19
vtools show project
vtools select variant "CEU_mafGD10>=0.05" -t common_ceu
vtools select v_funct "CEU_mafGD10<0.01" -t rare_ceu
vtools use refGene
vtools show annotation refGene
vtools associate -h
vtools show tests
vtools show test LinRegBurden
vtools associate common_ceu BMI --covariate SEX -m "LinRegBurden --alternative 2" -j1 --to_db EA_CV > EA_CV.asso.res
grep -i error *.log
less EA_CV.asso.res
sort -g -k7 EA_CV.asso.res | head
vtools show fields
vtools associate rare_ceu BMI --covariate SEX -m "LinRegBurden --alternative 2" -g refGene.name2 -j1 --to_db EA_RV > EA_RV.asso.res
grep -i error *.log | tail -10
less EA_RV.asso.res
sort -g -k6 EA_RV.asso.res | head
vtools associate rare_ceu BMI --covariate SEX -m "VariableThresholdsQt --alternative 2 -p 100000 --adaptive 0.0005" -g refGene.name2 -j1 --to_db EA_RV > EA_RV_VT.asso.res
grep -i error *.log | tail -10
less EA_RV_VT.asso.res
sort -g -k6 EA_RV_VT.asso.res | head
vtools select rare_ceu "refGene.name2='ABCC1'" -o chr pos ref alt CEU_mafGD10 numGD10 mut_type --header
cd ..

# YRI sub-project: association tests
vtools select variant --samples "RACE=0" -t YRI
mkdir -p yri
cd yri
vtools init yri --parent ../ --variants YRI --samples "RACE=0" --build hg19
vtools select variant "YRI_mafGD10>=0.05" -t common_yri
vtools select v_funct "YRI_mafGD10<0.01" -t rare_yri
vtools use refGene
vtools associate common_yri BMI --covariate SEX -m "LinRegBurden --alternative 2" -j1 --to_db YA_CV > YA_CV.asso.res
vtools associate rare_yri BMI --covariate SEX -m "LinRegBurden --alternative 2" -g refGene.name2 -j1 --to_db YA_RV > YA_RV.asso.res
vtools associate rare_yri BMI --covariate SEX -m "VariableThresholdsQt --alternative 2 -p 100000 --adaptive 0.0005" -g refGene.name2 -j1 --to_db YA_RV > YA_RV_VT.asso.res
cd ..

# Meta-analysis of the two variable-threshold result files
vtools_report meta_analysis ceu/EA_RV_VT.asso.res yri/YA_RV_VT.asso.res --beta 5 --pval 6 --se 7 -n 2 --link 1 > META_RV_VT.asso.res
cut -f1,3 META_RV_VT.asso.res | head