Introduction

This is an script developed using a population simulated in AlphaSimR package (Gaynor, Gorjanc, and Hickey 2021). One trait with high heritability and controlled by a few QTLs was simulated into two breeding populations. In addition, four GWAS models were implemented and an extension with models in GAPIT (Wang and Zhang 2021) is presented, as follows:

Model 1. Non Adjusted markers
Model 2. Adjusted by Q
Model 3. Adjusted by K
Model 4. Adjusted by K+Q
Models with GAPIT Extra implementations

Loading the packages for the simulation/GWAS

rm(list=ls())
library(AlphaSimR) 
## Loading required package: R6
library(rrBLUP) #For K and Q+K GWAS
library(qqman) #For Manhattan and Q-Q plots
## 
## For example usage please run: vignette('qqman')
## 
## Citation appreciated but not required:
## Turner, (2018). qqman: an R package for visualizing GWAS results using Q-Q and manhattan plots. Journal of Open Source Software, 3(25), 731, https://doi.org/10.21105/joss.00731.
## 
library(ggfortify)
## Loading required package: ggplot2
#install.packages("remotes")
#remotes::install_github("jiabowang/GAPIT3")
library(GAPIT)

Creating the population for the GWAS

1. Creating the base genome

# Global parameters
nQtlPerChr = 4 #Per pair of chromosome
nSnpPerChr = 500 # must be > nQtlPerChr

# Simulate mapping population
set.seed(18556255)
FOUNDERPOP = runMacs(nInd=1000,
                     nChr=10,
                     segSites=nSnpPerChr,
                     split=20)

# Simulate the trait of interest
SP = SimParam$
  new(FOUNDERPOP)$
  restrSegSites(minSnpPerChr=nSnpPerChr,
                minQtlPerChr=nSnpPerChr,
                overlap=TRUE)$
  addTraitA(nQtlPerChr,gamma=TRUE)$
  addSnpChip(nSnpPerChr)$
  setVarE(H2=0.8) # Trait heritability

pop = newPop(FOUNDERPOP)

2. Creating a population structure

In a way to create a population structure, we implemented two steps. The first was to split the base genome into two. We used above (runMacs function) the argument split=20, which splits the base genome in two, 20 generations ago. The second, is going to be the cycles of selections where each population go through (below). As we did split the base genome before, we will create two populations (popA and popB), where the popA takes the individuals from 1:500, and the second from 501:1000 out of 1000 individuals from the base genome. Then, the first population (popA) go through 6 cycles of selections and the second population (popB) face 4 cycles of selection. As they initially differ, after the different number of cycles of selection, it is likely that the allele frequency of the two populations would be different.

#Create 2 sub populations
popA = pop[1:500]
popB = pop[501:1000]

#Cycles of selection
for(i in 1:6){
  popA = selectCross(popA,
                     nInd=10,
                     nCrosses=50,
                     nProgeny=10)
  if(i<4){
    popB = selectCross(popB,
                       nInd=10,
                       nCrosses=50,
                       nProgeny=10)
  }
}

#Combining both pops
pop = c(popA,popB)

PCA and population genotypes

In this step we will extract the SNPs from the core population and we will create a PCA to explore the structure in the data.

# Format genotypes for rrBLUP
rawGeno = pullSnpGeno(pop)
freq = colMeans(rawGeno)/2
MAF = apply(cbind(freq,1-freq),1,min)

# Ploting PCA
pca_res <- prcomp(rawGeno)
autoplot(pca_res)

Organizing the data for the GWAS

For the GWAS analysis, we going to organize the data. We will need dosage matrix (coded -1,0,1), phenotypes values for the trait of interest, and the QTL positions.

# Genotypes/SNP matrix/dosage matrix
rawGeno = rawGeno-1
geno = as.data.frame(t(rawGeno))
geno = data.frame(snp=row.names(geno),
                  chr=rep(1:10,each=nSnpPerChr),
                  pos=rep(1:nSnpPerChr,10),
                  geno)

# Create "pheno" data.frames for rrBLUP
pheno = data.frame(gid=names(geno)[-(1:3)],
                   trait1=pop@pheno[,1])

# Tracking the information of population
phenoQ = data.frame(gid=pheno$gid,
                    subPop=factor(rep(c("a","b"),each=500)),
                    trait1=pheno$trait1)

# Find QTL locations within SNP chip and chromosome position
qtl =  paste0(rep(1:10, each = 10), '_',SP$traits[[1]]@lociLoc)

Model 1 - GWAS with no adjustament

Single marker regression

\[y = S\alpha + e\]

# No adjustments----
model0 = data.frame(snp=geno$snp,
                    chr=geno$chr,
                    pos=geno$pos,
                    trait1=rep(NA_real_,10*nSnpPerChr))

for(i in 1:(10*nSnpPerChr)){
  if(MAF[i]>=0.05){
    #Fit linear model with lm() and extract p-value
    mod1 = summary(lm(pheno$trait1~rawGeno[,i]))
    tmp = mod1[[4]][2,4]
    #Check for markers confounded with structure
    if(is.na(tmp)){ 
      model0$trait1[i] = 1
    }else{
      model0$trait1[i] = tmp
    }
  }else{
    model0$trait1[i] = 1
  }
}
#Account for p-value=0
model0$trait1[model0$trait1==0] = 1e-300

Model 2 - GWAS correcting with population structure (Q)

Single marker regression with population as fixed effect

\[y = X\beta + S\alpha + e\]

#Adjust for population structure (Q)----
modelQ = model0
for(i in 1:(10*nSnpPerChr)){
  if(MAF[i]>=0.05){
    #Fit linear model with lm() and extract p-value
    mod2 = summary(lm(pheno$trait1~phenoQ$subPop+rawGeno[,i]))[[4]]
    if(dim(mod2)[1] == 2){
      tmp = NA
    }else{
      tmp = mod2[3,4]
    }
    #Check for markers confounded with structure
    if(is.na(tmp)){ 
      modelQ$trait1[i] = 1
    }else{
      modelQ$trait1[i] = tmp
    }
  }else{
    modelQ$trait1[i] = 1
  }
}
#Account for p-value=0
modelQ$trait1[modelQ$trait1==0] = 1e-300

Model 3 - GWAS correcting with Kinship matrix (K)

Single marker regression with covariance information among the individuals. The matrix will be calculated internally by the function ’A.mat`.

\[y = S\alpha + Qv + e \]

#Adjust for kinship (K)----
modelK = GWAS(pheno,geno,plot=FALSE)
## [1] "GWAS for trait: trait1"
## [1] "Variance components estimated. Testing markers."
modelK$trait1 = 10^(-modelK$trait1) #Revert to p-value

Model 4 - GWAS correcting with both, Q and K

\[y = X\beta + S\alpha + Qv + e \]

#Adjust for both structure and kinship (Q+K)----
modelQK = GWAS(phenoQ,geno,fixed="subPop",plot=FALSE)
## [1] "GWAS for trait: trait1"
## [1] "Variance components estimated. Testing markers."
modelQK$trait1 = 10^(-modelQK$trait1) #Revert to p-value

Manhattan plot

# Assuming Bonferroni Threshold (0.05/m)
hline = -log10(0.05/ncol(rawGeno))

# Manhattan plots
op = par(mfrow=c(2,2),mai=c(.9,.9,0.9,0.9),
         mar=c(2.5,2.5,1,1)+0.1,
         mgp = c(1.5,0.5,0))

manhattan(model0,chr="chr",bp="pos",p="trait1",snp="snp",  highlight=qtl,
         main="Unadjusted", col = c("blue4", "orange3"),
          suggestiveline = FALSE, genomewideline = hline)
manhattan(modelQ,chr="chr",bp="pos",p="trait1",snp="snp",highlight=qtl,
          main="Adjusted for Q", col = c("blue4", "orange3"),
          suggestiveline = FALSE, genomewideline = hline)
manhattan(modelK,chr="chr",bp="pos",p="trait1",snp="snp",highlight=qtl,
          main="Adjusted for K", col = c("blue4", "orange3"),
          suggestiveline = FALSE, genomewideline = hline)
manhattan(modelQK,chr="chr",bp="pos",p="trait1",snp="snp",highlight=qtl,
          main="Adjusted for Q+K", col = c("blue4", "orange3"),
          suggestiveline = FALSE, genomewideline = hline)

QQ-plot

#Q-Q plots
op = par(mfrow=c(2,2),mai=c(.9,.9,0.9,0.9),
         mar=c(2.5,2.5,1,1)+0.1,
         mgp = c(1.5,0.5,0))
qq(model0$trait1,main="Unadjusted", col = "blue4")
qq(modelQ$trait1,main="Adjusted for Q", col = "blue4")
qq(modelK$trait1,main="Adjusted for K", col = "blue4")
qq(modelQK$trait1,main="Adjusted for Q+K", col = "blue4")

Extra Models with GAPIT

# Create "pheno" data.frames for rrBLUP
myYp = data.frame(gid=pop@id,
                  trait1=pop@pheno[,1])


# Geno
rawGeno = pullSnpGeno(pop)
Markers = data.frame(myYp["gid"],rawGeno)
rownames(Markers) = NULL

# markers position
Map.pos = data.frame(snp=colnames(Markers[,-1]),
                     chr=rep(1:10,each=nSnpPerChr),
                     pos=rep(1:nSnpPerChr,10)) 


# Model MLMM

mod1=GAPIT(Y=myYp, #fist column is ID
           GD=Markers,
           GM=Map.pos,
           PCA.total=2,
           model="MLMM",  # MLMM
           Geno.View.output = FALSE,
           PCA.View.output=FALSE)

# Model FarmCPU

mod2=GAPIT(Y=myYp, #fist column is ID
           GD=Markers,
           GM=Map.pos,
           PCA.total=2,
           model="FarmCPU",  # MLMM
           Geno.View.output = FALSE,
           PCA.View.output=FALSE)

# Model MLM

mod3=GAPIT(Y=myYp, #fist column is ID
           GD=Markers,
           GM=Map.pos,
           PCA.total=2,
           model="MLM",  # MLMM
           Geno.View.output = FALSE,
           PCA.View.output=FALSE)

References

Gaynor, R Chris, Gregor Gorjanc, and John M Hickey. 2021. “AlphaSimR: An r Package for Breeding Program Simulations.” G3 11 (2): jkaa017.
Wang, Jiabo, and Zhiwu Zhang. 2021. “GAPIT Version 3: Boosting Power and Accuracy for Genomic Association and Prediction.” Genomics, Proteomics & Bioinformatics 19 (4): 629–40.

  1. University of Florida, ↩︎

  2. University of Florida, ↩︎