This is an script developed using a population simulated in AlphaSimR package (Gaynor, Gorjanc, and Hickey 2021). One trait with high heritability and controlled by a few QTLs was simulated into two breeding populations. In addition, four GWAS models were implemented and an extension with models in GAPIT (Wang and Zhang 2021) is presented, as follows:
Model 1. Non Adjusted markers
Model 2. Adjusted by Q
Model 3. Adjusted by K
Model 4. Adjusted by K+Q
Models with GAPIT Extra implementations
## Loading required package: R6
##
## For example usage please run: vignette('qqman')
##
## Citation appreciated but not required:
## Turner, (2018). qqman: an R package for visualizing GWAS results using Q-Q and manhattan plots. Journal of Open Source Software, 3(25), 731, https://doi.org/10.21105/joss.00731.
##
## Loading required package: ggplot2
# Global parameters
nQtlPerChr = 4 #Per pair of chromosome
nSnpPerChr = 500 # must be > nQtlPerChr
# Simulate mapping population
set.seed(18556255)
FOUNDERPOP = runMacs(nInd=1000,
nChr=10,
segSites=nSnpPerChr,
split=20)
# Simulate the trait of interest
SP = SimParam$
new(FOUNDERPOP)$
restrSegSites(minSnpPerChr=nSnpPerChr,
minQtlPerChr=nSnpPerChr,
overlap=TRUE)$
addTraitA(nQtlPerChr,gamma=TRUE)$
addSnpChip(nSnpPerChr)$
setVarE(H2=0.8) # Trait heritability
pop = newPop(FOUNDERPOP)
In a way to create a population structure, we implemented two steps.
The first was to split the base genome into two. We used above
(runMacs
function) the argument split=20
,
which splits the base genome in two, 20 generations ago. The second, is
going to be the cycles of selections where each population go through
(below). As we did split the base genome before, we will create two
populations (popA and popB), where the popA takes the individuals from
1:500, and the second from 501:1000 out of 1000 individuals from the
base genome. Then, the first population (popA) go through 6 cycles of
selections and the second population (popB) face 4 cycles of selection.
As they initially differ, after the different number of cycles of
selection, it is likely that the allele frequency of the two populations
would be different.
In this step we will extract the SNPs from the core population and we will create a PCA to explore the structure in the data.
# Format genotypes for rrBLUP
rawGeno = pullSnpGeno(pop)
freq = colMeans(rawGeno)/2
MAF = apply(cbind(freq,1-freq),1,min)
# Ploting PCA
pca_res <- prcomp(rawGeno)
autoplot(pca_res)
For the GWAS analysis, we going to organize the data. We will need dosage matrix (coded -1,0,1), phenotypes values for the trait of interest, and the QTL positions.
# Genotypes/SNP matrix/dosage matrix
rawGeno = rawGeno-1
geno = as.data.frame(t(rawGeno))
geno = data.frame(snp=row.names(geno),
chr=rep(1:10,each=nSnpPerChr),
pos=rep(1:nSnpPerChr,10),
geno)
# Create "pheno" data.frames for rrBLUP
pheno = data.frame(gid=names(geno)[-(1:3)],
trait1=pop@pheno[,1])
# Tracking the information of population
phenoQ = data.frame(gid=pheno$gid,
subPop=factor(rep(c("a","b"),each=500)),
trait1=pheno$trait1)
# Find QTL locations within SNP chip and chromosome position
qtl = paste0(rep(1:10, each = 10), '_',SP$traits[[1]]@lociLoc)
Single marker regression
\[y = S\alpha + e\]
# No adjustments----
model0 = data.frame(snp=geno$snp,
chr=geno$chr,
pos=geno$pos,
trait1=rep(NA_real_,10*nSnpPerChr))
for(i in 1:(10*nSnpPerChr)){
if(MAF[i]>=0.05){
#Fit linear model with lm() and extract p-value
mod1 = summary(lm(pheno$trait1~rawGeno[,i]))
tmp = mod1[[4]][2,4]
#Check for markers confounded with structure
if(is.na(tmp)){
model0$trait1[i] = 1
}else{
model0$trait1[i] = tmp
}
}else{
model0$trait1[i] = 1
}
}
#Account for p-value=0
model0$trait1[model0$trait1==0] = 1e-300
Single marker regression with population as fixed effect
\[y = X\beta + S\alpha + e\]
#Adjust for population structure (Q)----
modelQ = model0
for(i in 1:(10*nSnpPerChr)){
if(MAF[i]>=0.05){
#Fit linear model with lm() and extract p-value
mod2 = summary(lm(pheno$trait1~phenoQ$subPop+rawGeno[,i]))[[4]]
if(dim(mod2)[1] == 2){
tmp = NA
}else{
tmp = mod2[3,4]
}
#Check for markers confounded with structure
if(is.na(tmp)){
modelQ$trait1[i] = 1
}else{
modelQ$trait1[i] = tmp
}
}else{
modelQ$trait1[i] = 1
}
}
#Account for p-value=0
modelQ$trait1[modelQ$trait1==0] = 1e-300
Single marker regression with covariance information among the individuals. The matrix will be calculated internally by the function ’A.mat`.
\[y = S\alpha + Qv + e \]
## [1] "GWAS for trait: trait1"
## [1] "Variance components estimated. Testing markers."
\[y = X\beta + S\alpha + Qv + e \]
#Adjust for both structure and kinship (Q+K)----
modelQK = GWAS(phenoQ,geno,fixed="subPop",plot=FALSE)
## [1] "GWAS for trait: trait1"
## [1] "Variance components estimated. Testing markers."
# Assuming Bonferroni Threshold (0.05/m)
hline = -log10(0.05/ncol(rawGeno))
# Manhattan plots
op = par(mfrow=c(2,2),mai=c(.9,.9,0.9,0.9),
mar=c(2.5,2.5,1,1)+0.1,
mgp = c(1.5,0.5,0))
manhattan(model0,chr="chr",bp="pos",p="trait1",snp="snp", highlight=qtl,
main="Unadjusted", col = c("blue4", "orange3"),
suggestiveline = FALSE, genomewideline = hline)
manhattan(modelQ,chr="chr",bp="pos",p="trait1",snp="snp",highlight=qtl,
main="Adjusted for Q", col = c("blue4", "orange3"),
suggestiveline = FALSE, genomewideline = hline)
manhattan(modelK,chr="chr",bp="pos",p="trait1",snp="snp",highlight=qtl,
main="Adjusted for K", col = c("blue4", "orange3"),
suggestiveline = FALSE, genomewideline = hline)
manhattan(modelQK,chr="chr",bp="pos",p="trait1",snp="snp",highlight=qtl,
main="Adjusted for Q+K", col = c("blue4", "orange3"),
suggestiveline = FALSE, genomewideline = hline)
#Q-Q plots
op = par(mfrow=c(2,2),mai=c(.9,.9,0.9,0.9),
mar=c(2.5,2.5,1,1)+0.1,
mgp = c(1.5,0.5,0))
qq(model0$trait1,main="Unadjusted", col = "blue4")
qq(modelQ$trait1,main="Adjusted for Q", col = "blue4")
qq(modelK$trait1,main="Adjusted for K", col = "blue4")
qq(modelQK$trait1,main="Adjusted for Q+K", col = "blue4")
# Create "pheno" data.frames for rrBLUP
myYp = data.frame(gid=pop@id,
trait1=pop@pheno[,1])
# Geno
rawGeno = pullSnpGeno(pop)
Markers = data.frame(myYp["gid"],rawGeno)
rownames(Markers) = NULL
# markers position
Map.pos = data.frame(snp=colnames(Markers[,-1]),
chr=rep(1:10,each=nSnpPerChr),
pos=rep(1:nSnpPerChr,10))
# Model MLMM
mod1=GAPIT(Y=myYp, #fist column is ID
GD=Markers,
GM=Map.pos,
PCA.total=2,
model="MLMM", # MLMM
Geno.View.output = FALSE,
PCA.View.output=FALSE)
# Model FarmCPU
mod2=GAPIT(Y=myYp, #fist column is ID
GD=Markers,
GM=Map.pos,
PCA.total=2,
model="FarmCPU", # MLMM
Geno.View.output = FALSE,
PCA.View.output=FALSE)
# Model MLM
mod3=GAPIT(Y=myYp, #fist column is ID
GD=Markers,
GM=Map.pos,
PCA.total=2,
model="MLM", # MLMM
Geno.View.output = FALSE,
PCA.View.output=FALSE)
University of Florida, mresende@ufl.edu↩︎
University of Florida, deamorimpeixotom@ufl.edu↩︎