Multi-trait and multi-environment strategies for Genomic Selection (GS)
Simulations have been demonstrated as a powerful tool to improve animal and plant breeding programs. In addition, those tools may offer an alternative to address theoretical concepts in quantitative genetics and breeding.
Here, we will use AlphaSimR package (Gaynor et
al. 2021). The package uses stochastic simulations for design and
optimization of breeding programs. The package offers a fast, simple,
and inexpensive way to test alternative breeding programs.
rm(list=ls())
#install.packages("AlphaSimR")
require(AlphaSimR)## Carregando pacotes exigidos: AlphaSimR
## Carregando pacotes exigidos: R6
For a whole set of simulations, four simple steps must be included, as follows:
Breeding programs are often targeting the improvement of several traits at a time. However, in a multi-trait framework, we may consider the correlations in-between the traits. Correlation may vary from -1 to 1, and the signal and the magnitude will impact the response with selection.
Here, we will use AlphaSimR to simulate three scenarios,
with different strategy in selection and with different correlations
in-between pair of traits.
First, we will check a case where two traits were simulated, with a positive correlation between them. In addition, the selection will be made in only one trait and we will be able to measure the indirect response in the other trait.
rm(list=ls())
require(AlphaSimR)
set.seed(53627)
#--- 1. Founder genome
founderGen = runMacs(nInd = 100,
nChr = 10,
segSites = 100,
species = "MAIZE")
SP = SimParam$new(founderGen)
#--- 2. Define the traits
means = c(100, 100)
vars = c(10, 20)
cors = matrix(data = c( 1.0, 0.3,
0.3, 1.0),
byrow = TRUE, nrow = 2, ncol = 2)
h2s = c(0.5, 0.5)
SP$addTraitA(nQtlPerChr = 100,
mean = means,
var = vars,
corA = cors)Create the base population for later selection.
# Base population
basePop = newPop(founderGen)
# Phenotype the population
basePop = setPheno(basePop, h2 = h2s)Plotting the phenotypic and genetic mean/variances through several generations of breeding.
# Store the results
nGenerations = 10 + 1 # +1 to store starting generation
meanG = vector("list", length = nGenerations)
varG = vector("list", length = nGenerations)
# Save the starting values
meanG[[1]] = meanG(basePop)
varG[[1]] = varG(basePop)
# First selection step
nSelected = 20
newPopSelected = selectInd(pop = basePop,
nInd = nSelected,
use = "pheno",
trait = 1)
# Selection over many generations
for (generation in 1:(nGenerations - 1)) {
newPop = randCross(newPopSelected, nCrosses = nInd(basePop))
newPop = setPheno(newPop, h2 = h2s)
newPopSelected = selectInd(pop = newPop,
nInd = nSelected,
use = "pheno",
trait = 1)
# Save summaries
meanG[[1 + generation]] = meanG(newPop)
varG[[1 + generation]] = varG(newPop)
}
# Plot results
meanGTrait1 = sapply(meanG, function(x) x[1])
meanGTrait2 = sapply(meanG, function(x) x[2])
meanRanges = range(c(meanGTrait1, meanGTrait2))
varGTrait1 = sapply(varG, function(x) x[1, 1])
varGTrait2 = sapply(varG, function(x) x[2, 2])
varRanges = range(c(varGTrait1, varGTrait2))
# Plot mean of genetic values over time
plot(x = 1:nGenerations, y = meanGTrait1, type = "l", col = "blue", lwd = 3,
xlab = "Generation", ylab = "Mean of genetic values", ylim = meanRanges)
lines(x = 1:nGenerations, y = meanGTrait2, type = "l", col = "blue", lty = 3, lwd = 3)
legend(x = "topleft", legend = c("1", "2"), title = "Trait",
lwd = 3, lty = c(1, 3), col = c("blue", "blue"))The second example will consider two traits with negative correlation, for the same breeding program.
rm(list=ls())
set.seed(53627)
founderGen = runMacs(nInd = 100,
nChr = 10,
segSites = 100,
species = "MAIZE")
SP = SimParam$new(founderGen)
# Define the traits
means = c(100, 100)
vars = c(10, 20)
cors = matrix(data = c( 1.0, -0.6,
-0.6, 1.0),
byrow = TRUE, nrow = 2, ncol = 2)
h2s = c(0.5, 0.5)
SP$addTraitA(nQtlPerChr = 100,
mean = means,
var = vars,
corA = cors)Create the base population for later selection
# Base population
basePop = newPop(founderGen)
# Phenotype the population
basePop = setPheno(basePop, h2 = h2s)Plotting the phenotypic and genetic mean through several generations of breeding.
# Store the results
nGenerations = 10 + 1 # +1 to store starting generation
meanG = vector("list", length = nGenerations)
varG = vector("list", length = nGenerations)
# Save the starting values
meanG[[1]] = meanG(basePop)
varG[[1]] = varG(basePop)
# First selection step
nSelected = 20
newPopSelected = selectInd(pop = basePop,
nInd = nSelected,
use = "pheno",
trait = 1)
# Selection over many generations
for (generation in 1:(nGenerations - 1)) {
newPop = randCross(newPopSelected, nCrosses = nInd(basePop))
newPop = setPheno(newPop, h2 = h2s)
newPopSelected = selectInd(pop = newPop,
nInd = nSelected,
use = "pheno",
trait = 1)
# Save summaries
meanG[[1 + generation]] = meanG(newPop)
varG[[1 + generation]] = varG(newPop)
}
# Plot results
meanGTrait1 = sapply(meanG, function(x) x[1])
meanGTrait2 = sapply(meanG, function(x) x[2])
meanRanges = range(c(meanGTrait1, meanGTrait2))
varGTrait1 = sapply(varG, function(x) x[1, 1])
varGTrait2 = sapply(varG, function(x) x[2, 2])
varRanges = range(c(varGTrait1, varGTrait2))
# Plot mean of genetic values over time
plot(x = 1:nGenerations, y = meanGTrait1, type = "l", col = "blue", lwd = 3,
xlab = "Generation", ylab = "Mean of genetic values", ylim = meanRanges)
lines(x = 1:nGenerations, y = meanGTrait2, type = "l", col = "blue", lty = 3, lwd = 3)
legend(x = "topleft", legend = c("1", "2"), title = "Trait",
lwd = 3, lty = c(1, 3), col = c("blue", "blue"))In this practice, we will show how to implement Genomic prediction/Genomic selection models in a Multi-trait multienvironment framework.
To perform the analyses, we will need the following packages:
## Carregando pacotes exigidos: AGHmatrix
## Carregando pacotes exigidos: BGLR
## Carregando pacotes exigidos: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.2 ✔ purrr 1.0.1
## ✔ tibble 3.2.1 ✔ dplyr 1.1.2
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::mutate() masks AlphaSimR::mutate()
This dataset was simulated using the coalescent theory implemented in \(AlphasimR\) package (Gaynor et al., 2021). This dataset mimic an evaluation of 500 maize genotypes, in six location, and four traits were measured. Also, a set of 3000 single nucleotide polymorphism (SNPs) were randomly sampled through the 10 pair of chromosomes.
Phenotypic data
We generated BLUEs (best linear unbiased estimation) for each trait, and
it can be loaded from Phenotypes.csv.
# Loading the data
Phenotypes = read.csv("Phenotypes.csv")
# As factor
Phenotypes$Genotype = Phenotypes$Genotype %>% as.factor
Phenotypes$Env = Phenotypes$Env %>% as.factorPhenotypes[,c(1,3:6)] %>%
pivot_longer(., cols = c('Trait1','Trait2','Trait3','Trait4')) %>%
ggplot(., aes(x = Env, y = value)) +
geom_boxplot(color="brown", fill="orange", alpha=0.2) +
theme(strip.text = element_text(face = "bold", size = 20, colour = 'blue'))+
labs(x = 'Environment', title = "Traits (BLUEs) distribution across environments",y="Density")+
facet_wrap(~name,scales = "free")
Using the trait 1 over four environments.
# Organizing the phenotypic data
Y<- Phenotypes[,c(1,2,3)] %>%
pivot_wider(., names_from = 'Env', values_from = 'Trait1')
# Scale the data
y = scale(Y[,-1])Genomic data
The genomic data (3000 SNP markers) was coded as 0,1,2 for aa,Aa,AA,
respectively. The dataset for all 500 genotypes can be loaded from
Genotypes.csv.
Creating G Matrix
Here, we will compose the additive relationship matrix (G matrix) following the propositions made by VanRaden (2008).
## Initial data:
## Number of Individuals: 500
## Number of Markers: 3000
##
## Missing data check:
## Total SNPs: 3000
## 0 SNPs dropped due to missing data threshold of 1
## Total of: 3000 SNPs
## MAF check:
## No SNPs with MAF below 0
## Monomorphic check:
## No monomorphic SNPs
## Summary check:
## Initial: 3000 SNPs
## Final: 3000 SNPs ( 0 SNPs removed)
##
## Completed! Time = 2.32 seconds
Statistical model
The GBLUP model in a multitrait framework is given as follows:
\[ y = 1u + Za + e \]
where \(y\) is the matrix of BLUEs data for the target traits; \(u\) represents the mean (fixed); \(a\) is the matrix of genotype effects (assumed to be random), where \(a \sim N(0,\sum_a\otimes G)\), \(G\) is a matrix that describes the genomic similarities between pair of genotypes and \(\sum_a\) represents the genetic covariance matrix; and \(e\) is the residuals (random), where \(e\sim N(0,\sum_{res}\otimes I)\), \(\sum_{res}\) is the residual covariance matrix. The letter \(Z\) refers to the incidence matrix for \(a\). Here, we use the ‘Multitrait()’ function from BGLR package for fitting the model (Pérez-Rodrı́guez & Los Campos, 2022)
# CROSS-VALIDATION CV1 (Predicting traits for new lines)
#--- 1. Setting the NA's
set.seed(12345)
yNA2 <- y
yNA2[sample(1:dim(Y)[1],100),] <- NA
#--- 2. Model
fm2 <- Multitrait(
y = yNA2,
ETA = list(A=list(K=G.Mat, model='RKHS', Cov = list(type = 'UN'))),
resCov = list(type = 'DIAG'),
nIter = 1200,
burnIn = 120,
thin = 5,
verbose = FALSE)## Checking variance co-variance matrix K for linear term 1
## Ok
## Setting linear term 1
## MSx=1.49553739794528
## UNstructured covariance matrix
## df0 was set to 5
## S0 set to
## 1 2 3 4
## 1 3.545771 1.1278878 1.1837981 1.1138041
## 2 1.127888 3.3662059 1.0893164 0.9879803
## 3 1.183798 1.0893164 3.2704436 0.9928134
## 4 1.113804 0.9879803 0.9928134 3.1904703
## Initializing resCov
## Setting hyperparameters for DIAG R
## df0 set to 5 for all the traits
## S0 was set to
## 1 2 3 4
## 3.711983 3.524001 3.423749 3.340027
## Done
#--- 3. Extracting the estimated parameters
YHatInt2 <- fm2$ETAHat
#--- 4. Predictive correlation
test2<-is.na(yNA2)
tstB<-test2[,1] # Same position for all traits
CORB.T1<-cor(YHatInt2[tstB,1],y[tstB,1])
CORB.T2<-cor(YHatInt2[tstB,2],y[tstB,2])
CORB.T3<-cor(YHatInt2[tstB,3],y[tstB,3])
CORB.T4<-cor(YHatInt2[tstB,4],y[tstB,4])
#--- 4. Extracting estimates of variance parameters
(CORB.RES<-fm2$resCov$R) # residual covariance matrix## [,1] [,2] [,3] [,4]
## [1,] 0.506008 0.000000 0.0000000 0.000000
## [2,] 0.000000 0.631697 0.0000000 0.000000
## [3,] 0.000000 0.000000 0.6420519 0.000000
## [4,] 0.000000 0.000000 0.0000000 0.604116
(CORB.u<-fm2$ETA[[1]]$Cov$Omega) # genetic covariance matrix UE## 1 2 3 4
## 1 0.4590653 0.2588641 0.2559555 0.2552953
## 2 0.2588641 0.3350281 0.2118696 0.2048813
## 3 0.2559555 0.2118696 0.3066488 0.2057984
## 4 0.2552953 0.2048813 0.2057984 0.3014032
#--- 5. Trait correlation
cov2cor(CORB.u)## 1 2 3 4
## 1 1.0000000 0.6600759 0.6821918 0.6863280
## 2 0.6600759 1.0000000 0.6610090 0.6447445
## 3 0.6821918 0.6610090 1.0000000 0.6769358
## 4 0.6863280 0.6447445 0.6769358 1.0000000
#--- 1. Run single trait
yB1NA <- yNA2[,1] #Trait 1
yB2NA <- yNA2[,2] #Trait 2
yB3NA <- yNA2[,3] #Trait 3
yB4NA <- yNA2[,4] #Trait 4
#--- 2. Model
ETA2 <- list(a = list(K=G.Mat,model='RKHS'))
fmB1 <- BGLR(yB1NA,ETA=ETA2,nIter=1200,burnIn=120,verbose=FALSE)
fmB2 <- BGLR(yB2NA,ETA=ETA2,nIter=1200,burnIn=120,verbose=FALSE)
fmB3 <- BGLR(yB3NA,ETA=ETA2,nIter=1200,burnIn=120,verbose=FALSE)
fmB4 <- BGLR(yB4NA,ETA=ETA2,nIter=1200,burnIn=120,verbose=FALSE)
#--- 3. Correlations
CORB.ST1 <- cor(fmB1$yHat[tstB],y[tstB,1])
CORB.ST2 <- cor(fmB2$yHat[tstB],y[tstB,2])
CORB.ST3 <- cor(fmB3$yHat[tstB],y[tstB,3])
CORB.ST4 <- cor(fmB4$yHat[tstB],y[tstB,4])#--- 1. Correlation output from both models
CorCV1 <- matrix(NA,nrow=4,ncol=2)
colnames(CorCV1) <- c('MT','ST')
# MTM
CorCV1[1,1] <- CORB.T1
CorCV1[2,1] <- CORB.T2
CorCV1[3,1] <- CORB.T3
CorCV1[4,1] <- CORB.T4
# STM
CorCV1[1,2] <- CORB.ST1
CorCV1[2,2] <- CORB.ST2
CorCV1[3,2] <- CORB.ST3
CorCV1[4,2] <- CORB.ST4
#--- 2. Comparison table
CorCV1## MT ST
## [1,] 0.4790521 0.4018156
## [2,] 0.3719704 0.3362224
## [3,] 0.4133658 0.3524806
## [4,] 0.4197146 0.3407975
## CROSS-VALIDATION CV2 (Partial Information via Associated Traits)
#--- 1. Set to NA the
set.seed(524521)
yNA <-y
yNA[sample(1:dim(Y)[1],100),1] <- NA
yNA[sample(1:dim(Y)[1],100),2] <- NA
yNA[sample(1:dim(Y)[1],100),3] <- NA
yNA[sample(1:dim(Y)[1],100),4] <- NA
#--- 2. Model
fm <- Multitrait(y = yNA,
ETA = list(A=list(K = G.Mat, model='RKHS', Cov = list(type = 'UN'))),
nIter = 1200,
burnIn = 120,
thin = 5,
verbose = FALSE)## Checking variance co-variance matrix K for linear term 1
## Ok
## Setting linear term 1
## MSx=1.49553739794528
## UNstructured covariance matrix
## df0 was set to 5
## S0 set to
## 1 2 3 4
## 1 3.357451 1.1475108 1.230007 1.2813106
## 2 1.147511 3.5389379 1.358102 0.9823556
## 3 1.230007 1.3581023 3.507723 1.2229457
## 4 1.281311 0.9823556 1.222946 3.4048273
## Initializing resCov
## Setting hyperparameters for UNstructured R
## S0 was set to
## 1 2 3 4
## 1 5.021194 1.716145 1.839521 1.916248
## 2 1.716145 5.292614 2.031093 1.469150
## 3 1.839521 2.031093 5.245931 1.828961
## 4 1.916248 1.469150 1.828961 5.092047
## Done
#--- 3. Extracting the estimated parameters
YHatInt <- fm$ETAHat
#--- 4. Predictive correlation
test<-is.na(yNA)
tst1<-test[,1];tst2<-test[,2]<-tst3<-test[,3];tst4<-test[,4]
COR.T1<-cor(YHatInt[tst1,1],y[tst1,1])
COR.T2<-cor(YHatInt[tst2,2],y[tst2,2])
COR.T3<-cor(YHatInt[tst3,3],y[tst3,3])
COR.T4<-cor(YHatInt[tst4,4],y[tst4,4])
#--- 5. Extracting estimates of variance parameters
(COR.RES<-fm$resCov$R) # residual covariance matrix## 1 2 3 4
## 1 0.53942397 0.08142737 0.1345714 0.07613355
## 2 0.08142737 0.74555259 0.2194581 0.06983051
## 3 0.13457140 0.21945809 0.7647425 0.12847156
## 4 0.07613355 0.06983051 0.1284716 0.67923635
(COR.u<-fm$ETA[[1]]$Cov$Omega) # genetic covariance matrix UE## 1 2 3 4
## 1 0.3694649 0.1542590 0.1543947 0.2009270
## 2 0.1542590 0.2499385 0.1314739 0.1302072
## 3 0.1543947 0.1314739 0.2333951 0.1516298
## 4 0.2009270 0.1302072 0.1516298 0.2830937
#--- 6. Trait correlation
cov2cor(CORB.u)## 1 2 3 4
## 1 1.0000000 0.6600759 0.6821918 0.6863280
## 2 0.6600759 1.0000000 0.6610090 0.6447445
## 3 0.6821918 0.6610090 1.0000000 0.6769358
## 4 0.6863280 0.6447445 0.6769358 1.0000000
#--- 1. Run single trait model
y1NA <- yNA[,1] #Trait 1
y2NA <- yNA[,2] #Trait 2
y3NA <- yNA[,3] #Trait 3
y4NA <- yNA[,4] #Trait 4
#--- 2. ETA and model for each trait
ETA <- list(A=list(K=G.Mat, model='RKHS'))
fm1 <- BGLR(y1NA,ETA=ETA,nIter=1200,burnIn=120,verbose=FALSE)
fm2 <- BGLR(y2NA,ETA=ETA,nIter=1200,burnIn=120,verbose=FALSE)
fm3 <- BGLR(y3NA,ETA=ETA,nIter=1200,burnIn=120,verbose=FALSE)
fm4 <- BGLR(y4NA,ETA=ETA,nIter=1200,burnIn=120,verbose=FALSE)
#--- 3. Correlation for single trait
COR.ST1 <- cor(fm1$yHat[tst1],y[tst1,1])
COR.ST2 <- cor(fm2$yHat[tst2],y[tst2,2])
COR.ST3 <- cor(fm3$yHat[tst3],y[tst3,3])
COR.ST4 <- cor(fm4$yHat[tst4],y[tst4,4])#--- 1. Correlation output from both models
CorCV2 <- matrix(NA,nrow=4,ncol=2)
colnames(CorCV2) <- c('MT','ST')
# MTM
CorCV2[1,1] <- COR.T1
CorCV2[2,1] <- COR.T2
CorCV2[3,1] <- COR.T3
CorCV2[4,1] <- COR.T4
# STM
CorCV2[1,2] <- COR.ST1
CorCV2[2,2] <- COR.ST2
CorCV2[3,2] <- COR.ST3
CorCV2[4,2] <- COR.ST4
#--- 2. Comparison table
CorCV2## MT ST
## [1,] 0.5177216 0.4072383
## [2,] 0.6541127 0.7067715
## [3,] 0.3581804 0.2814229
## [4,] 0.3133186 0.2541725
University of Florida, mresende@ufl.edu↩︎
University of Florida, jhernandezjarqui@ufl.edu↩︎
University of Florida, deamorimpeixotom@ufl.edu↩︎