Difference between revisions of "GeneABEL Exercise"
From Statistical Genetics Courses
(Created page with "__NOTITLE__ ==GeneABEL Exercise== <pre> plink --file GWAS_clean4 --pheno pheno.phen --pheno-name Aff --transpose --recode --out gwa_gabel --noweb plink --file GWAS_clean4 --...") |
|||
(4 intermediate revisions by the same user not shown) | |||
Line 2: | Line 2: | ||
==GeneABEL Exercise== | ==GeneABEL Exercise== | ||
− | + | R: | |
− | + | ||
− | + | <pre> # Load files | |
library(GenABEL) | library(GenABEL) | ||
convert.snp.tped(tped = "gwa_gabel_qtl.tped", tfam = "gwa_gabel_qtl.tfam", out = "gwa_gabel_qtl.raw", strand = "u") | convert.snp.tped(tped = "gwa_gabel_qtl.tped", tfam = "gwa_gabel_qtl.tfam", out = "gwa_gabel_qtl.raw", strand = "u") | ||
Line 12: | Line 11: | ||
slotNames(g.dat@gtdata) | slotNames(g.dat@gtdata) | ||
colnames(g.dat@phdata) | colnames(g.dat@phdata) | ||
+ | # sample size | ||
sample.size <- g.dat@gtdata@nids | sample.size <- g.dat@gtdata@nids | ||
+ | # number of SNPs | ||
snps.total <- g.dat@gtdata@nsnps | snps.total <- g.dat@gtdata@nsnps | ||
− | print(c(sample.size, snps.total)) | + | print(c(sample.size, snps.total)) |
+ | # Trait | ||
summary(g.dat@phdata$disease) | summary(g.dat@phdata$disease) | ||
− | hist(g.dat@phdata$disease, main="Quantitative Phenotype data summary", xlab = "Systolic pressure", freq = F,breaks=20, col="gray") | + | hist(g.dat@phdata$disease, main="Quantitative Phenotype data summary", xlab = "Systolic pressure measure", freq = F,breaks=20, col="gray") |
− | rug(g.dat@phdata$disease) | + | rug(g.dat@phdata$disease) |
+ | ### | ||
+ | # tests for association | ||
+ | ### | ||
+ | # GLM test | ||
test.snp <- scan.glm('disease ~ CRSNP', family = gaussian(), data = g.dat) | test.snp <- scan.glm('disease ~ CRSNP', family = gaussian(), data = g.dat) | ||
− | names(test.snp) | + | names(test.snp) |
− | alpha <- 5e-8 | + | alpha <- 5e-8 |
test.snp$snpnames[test.snp$P1df < alpha] | test.snp$snpnames[test.snp$P1df < alpha] | ||
test.snp$P1df[test.snp$P1df < alpha] | test.snp$P1df[test.snp$P1df < alpha] | ||
+ | # Score test | ||
test.qt <- qtscore(disease, data = g.dat, trait = "gaussian") | test.qt <- qtscore(disease, data = g.dat, trait = "gaussian") | ||
slotNames(test.qt) | slotNames(test.qt) | ||
names(test.qt@results) | names(test.qt@results) | ||
− | |||
test.qt@lambda | test.qt@lambda | ||
descriptives.scan(test.qt) | descriptives.scan(test.qt) | ||
− | + | rownames(results(test.qt))[results(test.qt)$P1df < alpha] | |
− | results(test.qt)$P1df[results(test.qt)$P1df < alpha] results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha] | + | results(test.qt)$P1df[results(test.qt)$P1df < alpha] |
+ | results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha] | ||
+ | # QQ plot | ||
obs <- sort(results(test.qt)$P1df) | obs <- sort(results(test.qt)$P1df) | ||
− | ept <- | + | ept <- c(1:length(obs)) / (length(obs) + 1) |
plot(-log10(ept), -log10(obs), main = "GWAS QQ plot, qtl", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)") | plot(-log10(ept), -log10(obs), main = "GWAS QQ plot, qtl", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)") | ||
abline(0, 1, col = "red") | abline(0, 1, col = "red") | ||
abline(h = 8, lty = 2) | abline(h = 8, lty = 2) | ||
+ | # Manhattan plot | ||
plot(test.qt, col = "black") | plot(test.qt, col = "black") | ||
+ | # Adding confounders | ||
test.qt.sex <- qtscore(disease ~ sex, data = g.dat, trait = "gaussian") | test.qt.sex <- qtscore(disease ~ sex, data = g.dat, trait = "gaussian") | ||
− | + | rownames(results(test.qt.sex))[results(test.qt)$P1df < alpha] | |
summary(lm(disease ~ sex, data = g.dat)) | summary(lm(disease ~ sex, data = g.dat)) | ||
− | + | ### | |
− | + | # MDS | |
− | + | ### | |
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
gkin <- ibs(g.dat, weight = "freq") | gkin <- ibs(g.dat, weight = "freq") | ||
gkin[1:10,1:10] | gkin[1:10,1:10] | ||
cps.full <- cmdscale(as.dist(.5 - gkin), eig = T, k = 10) | cps.full <- cmdscale(as.dist(.5 - gkin), eig = T, k = 10) | ||
− | names(cps.full) | + | names(cps.full) |
− | cps <- cps.full$points | + | cps <- cps.full$points |
plot(cps[,1], cps[,2], pch = g.dat@phdata$popn) | plot(cps[,1], cps[,2], pch = g.dat@phdata$popn) | ||
− | legend( | + | legend(-0.16, 0.06, c("TSI","MEX", "CEU"), pch = c(1,2,3)) |
+ | ### | ||
+ | # Corrected test | ||
+ | ### | ||
+ | # Incorporating PCs as predictors | ||
colnames(cps)<-c('C1','C2','C3','C4','C5','C6','C7','C8','C9','C10') | colnames(cps)<-c('C1','C2','C3','C4','C5','C6','C7','C8','C9','C10') | ||
gpc.dat <- g.dat | gpc.dat <- g.dat | ||
gpc.dat@phdata<-cbind(g.dat@phdata, cps) | gpc.dat@phdata<-cbind(g.dat@phdata, cps) | ||
− | test.pc.a <- scan.glm('disease ~ CRSNP + C1 + C2 + C3 + C4 + C5', family=gaussian(), data = gpc.dat) | + | test.pc.a <- scan.glm('disease ~ CRSNP + C1 + C2 + C3 + C4 + C5', family=gaussian(), data = gpc.dat) |
test.pc.a$snpnames[test.pc.a$P1df < alpha] | test.pc.a$snpnames[test.pc.a$P1df < alpha] | ||
test.pc.a$P1df[test.pc.a$P1df < alpha] | test.pc.a$P1df[test.pc.a$P1df < alpha] | ||
− | test.pc.b <- qtscore(disease ~ C1 + C2 + C3 + C4 + C5, data = gpc.dat, trait = "gaussian") | + | test.pc.b <- qtscore(disease ~ C1 + C2 + C3 + C4 + C5, data = gpc.dat, trait = "gaussian") |
test.pc.b@lambda | test.pc.b@lambda | ||
+ | # scree plot | ||
plot(cps.full$eig[1:10]/sum(cps.full$eig), axes = F, type = "b", xlab = "Components", ylim = c(0,0.05), ylab = "Proportion of Variations", main = "MDS analysis scree plot") | plot(cps.full$eig[1:10]/sum(cps.full$eig), axes = F, type = "b", xlab = "Components", ylim = c(0,0.05), ylab = "Proportion of Variations", main = "MDS analysis scree plot") | ||
axis(1, 1:10) | axis(1, 1:10) | ||
axis(2) | axis(2) | ||
+ | # cumulative plot | ||
plot(cumsum(cps.full$eig[1:10])/sum(cps.full$eig), axes = F, type = "b", ylim = c(0,0.2), xlab = "Components", ylab = "Proportion of Variations", main = "MDS analysis cumulative plot") | plot(cumsum(cps.full$eig[1:10])/sum(cps.full$eig), axes = F, type = "b", ylim = c(0,0.2), xlab = "Components", ylab = "Proportion of Variations", main = "MDS analysis cumulative plot") | ||
axis(1, 1:10) | axis(1, 1:10) | ||
axis(2) | axis(2) | ||
+ | # Genomic control | ||
+ | # Uncorrected GIF | ||
+ | test.qt@lambda | ||
+ | # Corrected p-value | ||
row.names(results(test.qt))[results(test.qt)$Pc1df < alpha] | row.names(results(test.qt))[results(test.qt)$Pc1df < alpha] | ||
results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha] | results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha] | ||
− | + | # Check for inflation of statistic | |
obs <- sort(results(test.qt)$chi2.1df) | obs <- sort(results(test.qt)$chi2.1df) | ||
− | ept <- sort(qchisq( | + | ept <- sort(qchisq(1:length(obs) / (length(obs) + 1), df = 1)) |
− | plot(ept, obs, main = "Genomic control ( | + | plot(ept, obs, main = "Genomic control (slope is the inflation factor)", xlab="Expected chisq, 1df", ylab="Observed chisq, 1df") |
abline(0, 1, col = "red") | abline(0, 1, col = "red") | ||
abline(0, test.qt@lambda[1], lty = 2) | abline(0, test.qt@lambda[1], lty = 2) | ||
+ | # Definition of GIF | ||
+ | # Conventional definition | ||
median(results(test.qt)$chi2.1df)/0.456 | median(results(test.qt)$chi2.1df)/0.456 | ||
+ | # GenABEL definition | ||
+ | lm(obs~ept)$coef[2] | ||
+ | # QQ plot | ||
obs <- sort(results(test.qt)$Pc1df) | obs <- sort(results(test.qt)$Pc1df) | ||
− | ept <- | + | ept <- c(1:length(obs)) / (length(obs) + 1) |
plot(-log10(ept), -log10(obs), main = "GWAS QQ plot adj. via Genomic Control", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)") | plot(-log10(ept), -log10(obs), main = "GWAS QQ plot adj. via Genomic Control", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)") | ||
abline(0, 1, col = "red") | abline(0, 1, col = "red") | ||
− | abline(h = 8, lty = 2) | + | abline(h = 8, lty = 2) |
+ | # EIGENSTRAT | ||
adj.gkin = gkin | adj.gkin = gkin | ||
diag(adj.gkin) = hom(g.dat)$Var | diag(adj.gkin) = hom(g.dat)$Var | ||
+ | # naxes = 3 is default value | ||
test.eg <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = 2) | test.eg <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = 2) | ||
descriptives.scan(test.eg) | descriptives.scan(test.eg) | ||
snp.eg <- row.names(results(test.eg))[results(test.eg)$P1df < alpha] | snp.eg <- row.names(results(test.eg))[results(test.eg)$P1df < alpha] | ||
− | pvalue.eg <- results(test.eg)$P1df[results(test.eg)$P1df < alpha] lambda.eg <- test.eg@lambda | + | pvalue.eg <- results(test.eg)$P1df[results(test.eg)$P1df < alpha] |
+ | lambda.eg <- test.eg@lambda | ||
snp.eg | snp.eg | ||
pvalue.eg | pvalue.eg | ||
lambda.eg | lambda.eg | ||
− | for (k in 1:10){ | + | # Change #PCs |
− | + | for (k in 1:10){ | |
+ | test.tmp <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = k) | ||
print(test.tmp@lambda$estimate) | print(test.tmp@lambda$estimate) | ||
} | } | ||
+ | # QQ plot | ||
obs <- sort(results(test.eg)$Pc1df) | obs <- sort(results(test.eg)$Pc1df) | ||
− | ept <- | + | ept <- c(1:length(obs)) / (length(obs) + 1) |
− | + | qqplot(-log10(ept), -log10(obs), main = "GWAS QQ plot adj. w/ EIGENSTRAT", xlab="Expected -log10(pvalue)", ylab="Observed -log10(pvalue)") | |
abline(0, 1, col = "red") | abline(0, 1, col = "red") | ||
abline(h = 8, lty = 2) | abline(h = 8, lty = 2) | ||
+ | # Manhattan plot comparison | ||
plot(test.qt, col = "black") | plot(test.qt, col = "black") | ||
add.plot(test.eg, col = "gray", pch = 3) | add.plot(test.eg, col = "gray", pch = 3) | ||
− | legend("topright", c("Original plot","After correction w/ EIGENSTRAT"), pch = c(1,3)) | + | legend("topright", c("Original plot","After correction w/ EIGENSTRAT"), pch = c(1,3)) |
+ | ### | ||
+ | # Basic test, binary trait | ||
+ | ### | ||
+ | # load files to GenABEL | ||
+ | convert.snp.tped(tped = "gwa_gabel.tped", tfam = "gwa_gabel.tfam", out = "gwa_gabel.raw", strand = "u") | ||
+ | b.dat <- load.gwaa.data(phen = "gwa_gabel.praw", gen = "gwa_gabel.raw", force = T) | ||
+ | slotNames(b.dat) | ||
+ | slotNames(b.dat@gtdata) | ||
+ | colnames(b.dat@phdata) | ||
+ | # sample size | ||
+ | b.dat@gtdata@nids | ||
+ | # number of cases and controls | ||
+ | case.size <- length(which(b.dat@phdata$disease == 1)) | ||
+ | control.size <- length(which(b.dat@phdata$disease == 0)) | ||
+ | case.size | ||
+ | control.size | ||
+ | # number of SNPs | ||
+ | snpsb.total <- b.dat@gtdata@nsnps | ||
+ | # GLM test | ||
+ | testb.snp <- scan.glm('disease ~ CRSNP', family = binomial(), data = b.dat) | ||
+ | names(testb.snp) | ||
+ | alpha <- 5e-8 | ||
+ | testb.snp$snpnames[testb.snp$P1df < alpha] | ||
+ | testb.snp$P1df[testb.snp$P1df < alpha] | ||
+ | # Score test | ||
+ | testb.qt <- qtscore(disease, data = b.dat, trait = "binomial") | ||
+ | slotNames(testb.qt) | ||
+ | descriptives.scan(testb.qt) | ||
+ | row.names(results(testb.qt))[results(testb.qt)$P1df < alpha] | ||
+ | results(testb.qt)$P1df[results(testb.qt)$P1df < alpha] | ||
+ | results(testb.qt)$Pc1df[results(testb.qt)$Pc1df < alpha] | ||
</pre> | </pre> |
Latest revision as of 16:20, 8 June 2018
GenABEL Exercise
R:
# GenABEL exercise: association scans on a quantitative trait, MDS on the
# IBS matrix, correction via principal components, genomic control and
# EIGENSTRAT, followed by basic scans on a binary trait.
# The script reads gwa_gabel_qtl.* and gwa_gabel.* from the working directory.

# Load files
library(GenABEL)
convert.snp.tped(tped = "gwa_gabel_qtl.tped", tfam = "gwa_gabel_qtl.tfam",
                 out = "gwa_gabel_qtl.raw", strand = "u")
g.dat <- load.gwaa.data(phen = "gwa_gabel_qtl.praw",
                        gen = "gwa_gabel_qtl.raw", force = TRUE)
slotNames(g.dat)
slotNames(g.dat@gtdata)
colnames(g.dat@phdata)
# sample size
sample.size <- g.dat@gtdata@nids
# number of SNPs
snps.total <- g.dat@gtdata@nsnps
print(c(sample.size, snps.total))
# Trait
summary(g.dat@phdata$disease)
hist(g.dat@phdata$disease, main = "Quantitative Phenotype data summary",
     xlab = "Systolic pressure measure", freq = FALSE, breaks = 20,
     col = "gray")
rug(g.dat@phdata$disease)

###
# tests for association
###
# GLM test
test.snp <- scan.glm("disease ~ CRSNP", family = gaussian(), data = g.dat)
names(test.snp)
# significance threshold used for every scan below
alpha <- 5e-8
test.snp$snpnames[test.snp$P1df < alpha]
test.snp$P1df[test.snp$P1df < alpha]
# Score test
test.qt <- qtscore(disease, data = g.dat, trait = "gaussian")
slotNames(test.qt)
names(test.qt@results)
test.qt@lambda
descriptives.scan(test.qt)
rownames(results(test.qt))[results(test.qt)$P1df < alpha]
results(test.qt)$P1df[results(test.qt)$P1df < alpha]
results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha]
# QQ plot: sorted observed p-values against the quantiles i / (n + 1)
obs <- sort(results(test.qt)$P1df)
ept <- seq_along(obs) / (length(obs) + 1)
plot(-log10(ept), -log10(obs), main = "GWAS QQ plot, qtl",
     xlab = "Expected -log10(pvalue)", ylab = "Observed -log10(pvalue)")
abline(0, 1, col = "red")
abline(h = 8, lty = 2)
# Manhattan plot
plot(test.qt, col = "black")
# Adding confounders
test.qt.sex <- qtscore(disease ~ sex, data = g.dat, trait = "gaussian")
# select on the sex-adjusted scan's own p-values, not the unadjusted scan's
rownames(results(test.qt.sex))[results(test.qt.sex)$P1df < alpha]
summary(lm(disease ~ sex, data = g.dat))

###
# MDS
###
gkin <- ibs(g.dat, weight = "freq")
gkin[1:10, 1:10]
# classical MDS on the distance 0.5 - IBS, keeping 10 dimensions
cps.full <- cmdscale(as.dist(.5 - gkin), eig = TRUE, k = 10)
names(cps.full)
cps <- cps.full$points
plot(cps[, 1], cps[, 2], pch = g.dat@phdata$popn)
legend(-0.16, 0.06, c("TSI", "MEX", "CEU"), pch = c(1, 2, 3))

###
# Corrected test
###
# Incorporating PCs as predictors
colnames(cps) <- c("C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10")
gpc.dat <- g.dat
gpc.dat@phdata <- cbind(g.dat@phdata, cps)
test.pc.a <- scan.glm("disease ~ CRSNP + C1 + C2 + C3 + C4 + C5",
                      family = gaussian(), data = gpc.dat)
test.pc.a$snpnames[test.pc.a$P1df < alpha]
test.pc.a$P1df[test.pc.a$P1df < alpha]
test.pc.b <- qtscore(disease ~ C1 + C2 + C3 + C4 + C5, data = gpc.dat,
                     trait = "gaussian")
test.pc.b@lambda
# scree plot
plot(cps.full$eig[1:10] / sum(cps.full$eig), axes = FALSE, type = "b",
     xlab = "Components", ylim = c(0, 0.05),
     ylab = "Proportion of Variations", main = "MDS analysis scree plot")
axis(1, 1:10)
axis(2)
# cumulative plot
plot(cumsum(cps.full$eig[1:10]) / sum(cps.full$eig), axes = FALSE,
     type = "b", ylim = c(0, 0.2), xlab = "Components",
     ylab = "Proportion of Variations",
     main = "MDS analysis cumulative plot")
axis(1, 1:10)
axis(2)

# Genomic control
# Uncorrected GIF
test.qt@lambda
# Corrected p-value
row.names(results(test.qt))[results(test.qt)$Pc1df < alpha]
results(test.qt)$Pc1df[results(test.qt)$Pc1df < alpha]
# Check for inflation of statistic
obs <- sort(results(test.qt)$chi2.1df)
ept <- sort(qchisq(seq_along(obs) / (length(obs) + 1), df = 1))
plot(ept, obs, main = "Genomic control (slope is the inflation factor)",
     xlab = "Expected chisq, 1df", ylab = "Observed chisq, 1df")
abline(0, 1, col = "red")
# use the numeric estimate as the slope (same accessor as in the egscore loop)
abline(0, test.qt@lambda$estimate, lty = 2)
# Definition of GIF
# Conventional definition (0.456 approximates the median of chi-square, 1 df)
median(results(test.qt)$chi2.1df) / 0.456
# GenABEL definition
coef(lm(obs ~ ept))[2]
# QQ plot
obs <- sort(results(test.qt)$Pc1df)
ept <- seq_along(obs) / (length(obs) + 1)
plot(-log10(ept), -log10(obs),
     main = "GWAS QQ plot adj. via Genomic Control",
     xlab = "Expected -log10(pvalue)", ylab = "Observed -log10(pvalue)")
abline(0, 1, col = "red")
abline(h = 8, lty = 2)

# EIGENSTRAT
adj.gkin <- gkin
diag(adj.gkin) <- hom(g.dat)$Var
# naxes = 3 is the default value; 2 axes are used here
test.eg <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = 2)
descriptives.scan(test.eg)
snp.eg <- row.names(results(test.eg))[results(test.eg)$P1df < alpha]
pvalue.eg <- results(test.eg)$P1df[results(test.eg)$P1df < alpha]
lambda.eg <- test.eg@lambda
snp.eg
pvalue.eg
lambda.eg
# Change #PCs: print the inflation estimate for 1 to 10 axes
for (k in seq_len(10)) {
  test.tmp <- egscore(disease, data = g.dat, kin = adj.gkin, naxes = k)
  print(test.tmp@lambda$estimate)
}
# QQ plot (both vectors are already sorted, so plot() matches the other
# QQ plots in this script)
obs <- sort(results(test.eg)$Pc1df)
ept <- seq_along(obs) / (length(obs) + 1)
plot(-log10(ept), -log10(obs), main = "GWAS QQ plot adj. w/ EIGENSTRAT",
     xlab = "Expected -log10(pvalue)", ylab = "Observed -log10(pvalue)")
abline(0, 1, col = "red")
abline(h = 8, lty = 2)
# Manhattan plot comparison
plot(test.qt, col = "black")
add.plot(test.eg, col = "gray", pch = 3)
legend("topright", c("Original plot", "After correction w/ EIGENSTRAT"),
       pch = c(1, 3))

###
# Basic test, binary trait
###
# load files to GenABEL
convert.snp.tped(tped = "gwa_gabel.tped", tfam = "gwa_gabel.tfam",
                 out = "gwa_gabel.raw", strand = "u")
b.dat <- load.gwaa.data(phen = "gwa_gabel.praw", gen = "gwa_gabel.raw",
                        force = TRUE)
slotNames(b.dat)
slotNames(b.dat@gtdata)
colnames(b.dat@phdata)
# sample size
b.dat@gtdata@nids
# number of cases and controls
case.size <- length(which(b.dat@phdata$disease == 1))
control.size <- length(which(b.dat@phdata$disease == 0))
case.size
control.size
# number of SNPs
snpsb.total <- b.dat@gtdata@nsnps
# GLM test
testb.snp <- scan.glm("disease ~ CRSNP", family = binomial(), data = b.dat)
names(testb.snp)
alpha <- 5e-8
testb.snp$snpnames[testb.snp$P1df < alpha]
testb.snp$P1df[testb.snp$P1df < alpha]
# Score test
testb.qt <- qtscore(disease, data = b.dat, trait = "binomial")
slotNames(testb.qt)
descriptives.scan(testb.qt)
row.names(results(testb.qt))[results(testb.qt)$P1df < alpha]
results(testb.qt)$P1df[results(testb.qt)$P1df < alpha]
results(testb.qt)$Pc1df[results(testb.qt)$Pc1df < alpha]