Difference between revisions of "AdvGeneMap2018Commands"

From Statistical Genetics Courses

Jump to: navigation, search
Line 2: Line 2:
  
 
===Plink - Part 1 - Data QC===
 
===Plink - Part 1 - Data QC===
 
+
  plink --file GWAS
plink --file GWAS
+
 
   plink --file GWAS --mind 0.10 --recode --out GWAS_clean_mind
 
   plink --file GWAS --mind 0.10 --recode --out GWAS_clean_mind
 
   plink --file GWAS_clean_mind --maf 0.05 --recode --out MAF_greater_5
 
   plink --file GWAS_clean_mind --maf 0.05 --recode --out MAF_greater_5
Line 16: Line 15:
 
   sexcheck = read.table("GWAS_sex_checking.sexcheck", header=T)
 
   sexcheck = read.table("GWAS_sex_checking.sexcheck", header=T)
 
   names(sexcheck)
 
   names(sexcheck)
   sex_problem = sexcheck[which(sexcheck$STATUS=="PROBLEM"),]
+
   sex_problem = sexcheck[which(sexcheck$STATUS=="PROBLEM"),]
 
   sex_problem
 
   sex_problem
 
   q()
 
   q()
Line 24: Line 23:
 
   setwd("to_your_working_directory/")
 
   setwd("to_your_working_directory/")
 
   dups = read.table("duplicates.genome", header = T)
 
   dups = read.table("duplicates.genome", header = T)
   problem_pairs = dups[which(dups$PI_HAT > 0.4),]
+
   problem_pairs = dups[which(dups$PI_HAT > 0.4),]
 
   problem_pairs
 
   problem_pairs
   problem_pairs = dups[which(dups$PI_HAT > 0.05),]
+
   problem_pairs = dups[which(dups$PI_HAT > 0.05),]
 
   myvars = c("FID1", "IID1", "FID2", "IID2", "PI_HAT")
 
   myvars = c("FID1", "IID1", "FID2", "IID2", "PI_HAT")
   problem_pairs[myvars]
+
   problem_pairs[myvars]
 
   q()
 
   q()
 
   ######
 
   ######
Line 34: Line 33:
 
   plink --file GWAS_clean3 --het
 
   plink --file GWAS_clean3 --het
 
   ###### in R
 
   ###### in R
   Dataset
+
   Dataset <- read.table("plink.het", header=TRUE, sep="", na.strings="NA", dec=".", strip.white=TRUE)
 +
  mean(Dataset$F)
 +
  sd(Dataset$F)
 +
  jpeg("hist.jpeg", height=1000, width=1000)
 +
  hist(scale(Dataset$F), xlim=c(-4,4))
 +
  dev.off()
 +
  q()
 +
  ######
 +
  plink --file GWAS_clean3 --pheno pheno.txt --pheno-name Aff --hardy
 +
  ##### in R
 +
  hardy = read.table("plink.hwe", header = T)
 +
  names(hardy)
 +
  hwe_prob = hardy&#91;which(hardy$P < 0.0000009),&#93;
 +
  hwe_prob
 +
  q()
 +
  ##########
 +
  plink --file GWAS_clean3 --exclude HWE_out.txt --recode --out GWAS_clean4
 +
 
  
 
===Plink - Part 2 - Controlling for Substructure===
 
===Plink - Part 2 - Controlling for Substructure===
<nowiki>
 
 
   plink --file GWAS_clean4 --genome --cluster --mds-plot 10
 
   plink --file GWAS_clean4 --genome --cluster --mds-plot 10
 
   #### in R
 
   #### in R
 
   mydata = read.table("mds_components.txt", header=T)
 
   mydata = read.table("mds_components.txt", header=T)
   mydata$pch[mydata$Group==1 ] <-15
+
   mydata$pch&#91;mydata$Group==1 &#93; <-15
   mydata$pch[mydata$Group==2 ] <-16
+
   mydata$pch&#91;mydata$Group==2 &#93; <-16
   mydata$pch[mydata$Group==3 ] <-2
+
   mydata$pch&#91;mydata$Group==3 &#93; <-2
 
   jpeg("mds.jpeg", height=500, width=500)
 
   jpeg("mds.jpeg", height=500, width=500)
 
   plot(mydata$C1, mydata$C2 ,pch=mydata$pch)
 
   plot(mydata$C1, mydata$C2 ,pch=mydata$pch)
Line 65: Line 80:
 
   par(mfrow=c(2,1))
 
   par(mfrow=c(2,1))
 
   aff_unadj<-read.table("unadj.assoc.logistic", header=TRUE)
 
   aff_unadj<-read.table("unadj.assoc.logistic", header=TRUE)
   aff_unadj.add.p<-aff_unadj[aff_unadj$TEST==c("ADD"),]$P
+
   aff_unadj.add.p<-aff_unadj&#91;aff_unadj$TEST==c("ADD"),&#93;$P
 
   broadqq(aff_unadj.add.p,"Some Trait Unadjusted")
 
   broadqq(aff_unadj.add.p,"Some Trait Unadjusted")
 
   aff_C1C2<-read.table("PC1-PC2.assoc.logistic", header=TRUE)
 
   aff_C1C2<-read.table("PC1-PC2.assoc.logistic", header=TRUE)
   aff_C1C2.add.p<-aff_C1C2[aff_C1C2$TEST==c("ADD"),]$P
+
   aff_C1C2.add.p<-aff_C1C2&#91;aff_C1C2$TEST==c("ADD"),&#93;$P
 
   broadqq(aff_C1C2.add.p, "Some Trait Adjusted for PC1 and PC2")
 
   broadqq(aff_C1C2.add.p, "Some Trait Adjusted for PC1 and PC2")
 
   dev.off()
 
   dev.off()
   gws_unadj = aff_unadj[which(aff_unadj$P < 0.0000001),]
+
   gws_unadj = aff_unadj&#91;which(aff_unadj$P < 0.0000001),&#93;
 
   gws_unadj
 
   gws_unadj
   gws_adjusted = aff_C1C2[which(aff_C1C2$P < 0.0000001),]
+
   gws_adjusted = aff_C1C2&#91;which(aff_C1C2$P < 0.0000001),&#93;
 
   gws_adjusted
 
   gws_adjusted
<nowiki>
 

Revision as of 15:02, 23 January 2018

Plink - Part 1 - Data QC

 plink --file GWAS
 plink --file GWAS --mind 0.10 --recode --out GWAS_clean_mind
 plink --file GWAS_clean_mind --maf 0.05 --recode --out MAF_greater_5
 plink --file GWAS_clean_mind --exclude MAF_greater_5.map --recode --out MAF_less_5
 plink --file MAF_greater_5 --geno 0.05 --recode --out MAF_greater_5_clean
 plink --file MAF_less_5 --geno 0.01 --recode --out MAF_less_5_clean
 plink --file MAF_greater_5_clean --merge MAF_less_5_clean.ped MAF_less_5_clean.map --recode --out GWAS_MAF_clean
 plink --file GWAS_MAF_clean --mind 0.03 --recode --out GWAS_clean2
 plink --file GWAS_clean2 --check-sex --out GWAS_sex_checking
 #### in R - open R by simply typing R
 setwd("to_your_working_directory/")
 sexcheck = read.table("GWAS_sex_checking.sexcheck", header=T)
 names(sexcheck)
 sex_problem = sexcheck[which(sexcheck$STATUS=="PROBLEM"),]
 sex_problem
 q()
 ##################################
 plink --file GWAS_clean2 --genome --out duplicates
 #### in R
 setwd("to_your_working_directory/")
 dups = read.table("duplicates.genome", header = T)
 problem_pairs = dups[which(dups$PI_HAT > 0.4),]
 problem_pairs
 problem_pairs = dups[which(dups$PI_HAT > 0.05),]
 myvars = c("FID1", "IID1", "FID2", "IID2", "PI_HAT")
 problem_pairs[myvars]
 q()
 ######
 plink --file GWAS_clean2 --remove IBS_excluded.txt --recode --out GWAS_clean3
 plink --file GWAS_clean3 --het
 ###### in R
 Dataset <- read.table("plink.het", header=TRUE, sep="", na.strings="NA", dec=".", strip.white=TRUE)
 mean(Dataset$F)
 sd(Dataset$F)
 jpeg("hist.jpeg", height=1000, width=1000)
 hist(scale(Dataset$F), xlim=c(-4,4))
 dev.off()
 q()
 ######
 plink --file GWAS_clean3 --pheno pheno.txt --pheno-name Aff --hardy 
 ##### in R
 hardy = read.table("plink.hwe", header = T)
 names(hardy)
 hwe_prob = hardy[which(hardy$P < 0.0000009),]
 hwe_prob
 q()
 ##########
 plink --file GWAS_clean3 --exclude HWE_out.txt --recode --out GWAS_clean4


Plink - Part 2 - Controlling for Substructure

 plink --file GWAS_clean4 --genome --cluster --mds-plot 10
 #### in R
 mydata = read.table("mds_components.txt", header=T)
 mydata$pch[mydata$Group==1 ] <-15
 mydata$pch[mydata$Group==2 ] <-16
 mydata$pch[mydata$Group==3 ] <-2
 jpeg("mds.jpeg", height=500, width=500)
 plot(mydata$C1, mydata$C2 ,pch=mydata$pch)
 dev.off()
 q()
 ######
 plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --logistic --adjust --out unadj
 plink --file GWAS_clean4 --genome --cluster --pca 10 header
 plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --covar plink.eigenvec --covar-name PC1 --logistic --adjust --out PC1
 plink --file GWAS_clean4 --pheno pheno.txt --pheno-name Aff --covar plink.eigenvec --covar-name PC1-PC2 --logistic --adjust --out PC1-PC2
 #### in R
 broadqq <-function(pvals, title)
 {
 	observed <- sort(pvals)
 	lobs <- -(log10(observed))
 	expected <- c(1:length(observed)) 
 	lexp <- -(log10(expected / (length(expected)+1)))
 	plot(c(0,7), c(0,7), col="red", lwd=3, type="l", xlab="Expected (-logP)", ylab="Observed (-logP)", xlim=c(0,max(lobs)), ylim=c(0,max(lobs)), las=1, xaxs="i", yaxs="i", bty="l", main = title)
 	points(lexp, lobs, pch=23, cex=.4, bg="black") }
 jpeg("qqplot_compare.jpeg", height=1000, width=500)
 par(mfrow=c(2,1))
 aff_unadj<-read.table("unadj.assoc.logistic", header=TRUE)
 aff_unadj.add.p<-aff_unadj[aff_unadj$TEST==c("ADD"),]$P
 broadqq(aff_unadj.add.p,"Some Trait Unadjusted")
 aff_C1C2<-read.table("PC1-PC2.assoc.logistic", header=TRUE)
 aff_C1C2.add.p<-aff_C1C2[aff_C1C2$TEST==c("ADD"),]$P
 broadqq(aff_C1C2.add.p, "Some Trait Adjusted for PC1 and PC2")
 dev.off()
 gws_unadj = aff_unadj[which(aff_unadj$P < 0.0000001),]
 gws_unadj
 gws_adjusted = aff_C1C2[which(aff_C1C2$P < 0.0000001),]
 gws_adjusted