Chapter 7 HMSC
7.1 Setup
# Random effects data (study design): one row per non-control sample,
# row names = sample IDs, with 'sample' kept as a factor column
StudyDesign <- sample_metadata %>%
mutate(
  rowname_id = sample,        # duplicate so 'sample' survives as a column
  sample = factor(sample)
) %>%
column_to_rownames("rowname_id") %>%
filter(treatment != "TG0") %>% # drop the control group
select(sample)
# Normalisation factor to account for genome length:
# genomes longer than the median are down-weighted, shorter ones up-weighted
normalisation_factor <-
  median(genome_metadata$length) / genome_metadata$length
# Genome count table (quantitative community data):
# length-normalise, add a +1 pseudocount to remove zeros, log-transform,
# and keep only the samples present in the study design (samples as rows).
# NOTE(review): multiplying by normalisation_factor assumes read_counts rows
# follow the same genome order as genome_metadata -- confirm upstream.
YData <- read_counts %>%
mutate(across(where(is.numeric), ~ log(round(.x * normalisation_factor, 0) + 1))) %>%
arrange(genome) %>%
column_to_rownames("genome") %>%
select(all_of(row.names(StudyDesign))) %>% # keep only study-design samples
as.data.frame() %>%
t() # transpose: samples in rows, genomes in columns
# Fixed effects data (explanatory variables)
# Builds the XData design table: sampling day, treatment group and
# log-sequencing depth for every non-control sample.
XData <- sample_metadata %>%
column_to_rownames("sample") %>%
filter(treatment != "TG0") %>% #remove controls
# NOTE(review): the logseqdepth vector below is ordered by StudyDesign row
# names and assigned positionally to the filtered sample_metadata rows --
# confirm both share the same sample order, otherwise depths are misassigned.
mutate(logseqdepth=read_counts %>% #total log-sequencing depth
select(all_of(row.names(StudyDesign))) %>%
colSums() %>%
log()
) %>%
# TG5 becomes the reference level of the treatment factor
mutate(treatment = factor(treatment, levels = c("TG5","TG4","TG2","TG3","TG1"))) %>%
mutate(day=as.numeric(day)) %>%
mutate(logseqdepth=as.numeric(logseqdepth)) %>%
select(day, treatment, logseqdepth)
# Genome phylogeny
PData <- genome_tree

7.3 Define and fit Hmsc models
# Define the Hmsc model: the transformed genome abundances (YData) are
# explained by the fixed effects in XFormula, with the sample-level study
# design and the genome phylogeny as additional structure.
model1 <- Hmsc(
  Y           = YData,
  XData       = XData,
  XFormula    = XFormula,
  studyDesign = StudyDesign,
  phyloTree   = PData,
  distr       = "normal", # Gaussian response on the log-transformed counts
  YScale      = TRUE
)

# Bundle the model(s) into a named list so downstream scripts can iterate
model_list <- list(model1 = model1)
if (!dir.exists("hmsc")) {
  dir.create("hmsc")
}
save(model_list, file = "hmsc/hmsc.Rdata")

Upload hmsc/hmsc.Rdata to the HPC, respecting the directory structure.
7.5 Generate Hmsc executables
The next chunk generates shell files for every combination of model, MCMC samples and MCMC thinning, ready to be launched as SLURM jobs.
# One row per model x MCMC-sample-count x thinning combination
modelchains <- expand.grid(model = names(model_list), sample = MCMC_samples_list, thin = MCMC_thin_list)
if (!dir.exists("hmsc")){dir.create("hmsc")}
# Write one SLURM shell script per combination; each script activates the
# conda environment and runs model fitting, cross-validation and convergence
# diagnostics in a single Rscript call.
# NOTE(review): nChains is referenced in the sprintf() call below but is not
# defined in this chunk -- confirm it is assigned earlier in the document
# (e.g. nChains <- 4), otherwise this loop errors at the sprintf() call.
for(i in c(1:nrow(modelchains))){
# Identifiers for this model/sample/thin combination
modelname=as.character(modelchains[i,1])
sample=modelchains[i,2]
thin=modelchains[i,3]
# File names for the shell executable, fitted model and convergence objects
executablename <- paste0("hmsc/exe_",modelname,"_",sample,"_",thin,".sh")
fitname <- paste0("fit_",modelname,"_",sample,"_",thin,".Rdata")
convname <- paste0("conv_",modelname,"_",sample,"_",thin,".Rdata")
# R expression (as text) that extracts this model from the saved list
model <- paste0('model_list$',modelname)
# Names under which the Gelman-Rubin diagnostics are saved in the conv file
psrf.beta.name <- paste0("psrf.beta.",modelname,"_",sample,"_",thin)
psrf.gamma.name <- paste0("psrf.gamma.",modelname,"_",sample,"_",thin)
psrf.rho.name <- paste0("psrf.rho.",modelname,"_",sample,"_",thin)
jobname <- paste0("hmsc_",modelname,"_",sample,"_",thin)
minutes <- 1000 # SLURM wall-time request, in minutes
# Shell script with an embedded R program; placeholders filled by sprintf()
code <- sprintf("#!/bin/bash
#SBATCH --job-name=%s # Job name
#SBATCH --nodes=1
#SBATCH --ntasks=4 # Run on 4 CPUs
#SBATCH --mail-user=antton.alberdi@sund.ku.dk
#SBATCH --mem=96gb # Job memory request
#SBATCH --time=%d # In minutes
# Activate conda environment
module load mamba/1.3.1
source activate /maps/projects/mjolnir1/people/jpl786/AMAC001_fibre_trial/hmsc/hmsc_env
# Run R script
Rscript -e '
library(tidyverse)
library(Hmsc)
# Load formulas and data
load(\"hmsc.Rdata\")
# Declare placeholders
modelname = \"%s\"
model = %s
fitname = \"%s\"
convname = \"%s\"
sample = %d
thin = %d
nchains = %d
# Run model fitting
m = sampleMcmc(hM = model,
samples = sample,
thin = thin,
adaptNf=rep(ceiling(0.4*sample*thin),model$nr),
transient = ceiling(0.5*sample*thin),
nChains = nchains,
nParallel = nchains)
# Run model cross-validation
partition <- createPartition(m, nfolds = 5)
cv <- computePredictedValues(m, partition=partition, nChains = 4)
# Assess chain convergence
mpost = convertToCodaObject(m,
spNamesNumbers = c(T,F),
covNamesNumbers = c(T,F),
Beta = TRUE,
Gamma = TRUE,
V = FALSE,
Sigma = FALSE,
Rho = TRUE,
Eta = FALSE,
Lambda = FALSE,
Alpha = FALSE,
Omega = FALSE,
Psi = FALSE,
Delta = FALSE) # Convert to CODA object
# Fixed effects
assign(paste0(\"psrf.beta.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Beta,multivariate=FALSE)$psrf)
# Traits
assign(paste0(\"psrf.gamma.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Gamma,multivariate=FALSE)$psrf)
# Phylogeny
assign(paste0(\"psrf.rho.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Rho,multivariate=FALSE)$psrf)
# Write convergence data
save(%s, %s, %s, file=convname)
# Save model fit object
save(m, cv, file=fitname)
'
", jobname, minutes, modelname, model, fitname, convname, sample, thin, nChains, psrf.beta.name, psrf.gamma.name, psrf.rho.name)
writeLines(code, executablename)
}

Upload the produced hmsc/exe_XXXXX.sh files to the HPC, respecting the directory structure.
7.7 Assess chain convergence
Convergence diagnostic values substantially above 1 indicate lack of convergence. Values below 1.1 are considered good enough.
# Load every convergence object ("conv_*") found in the hmsc folder
# into the global environment
conv_files <- list.files(path = "hmsc", pattern = "^conv_", full.names = TRUE, include.dirs = TRUE)
lapply(conv_files, load, envir = .GlobalEnv)
# Create a merged psrf.beta (genome) plot
# Collect every loaded psrf.beta.* matrix into one long table and show the
# distribution of Gelman-Rubin point estimates per model chain.
ls() %>%
grep("^psrf\\.beta", ., value = TRUE) %>%
map_dfr(~ {
mat <- get(.x)
data.frame(modelchain = .x, as.data.frame(mat, , stringsAsFactors = FALSE)) %>%
rownames_to_column(var="parameter") %>%
mutate(model = str_split(modelchain, "_") %>% map_chr(1) %>% gsub("psrf.beta.","",.)) %>%
mutate(sample = str_split(modelchain, "_") %>% map_chr(2)) %>% #extract sample info from model name
mutate(thin = str_split(modelchain, "_") %>% map_chr(3)) #extract thin info from model name
}) %>%
# FIX: reorder()'s argument is FUN (case-sensitive); the original 'fun=' was
# silently swallowed by '...', so chains were ordered by the default mean().
ggplot(.,aes(x=reorder(modelchain,-Point.est.,FUN=function(x) {quantile(x, probs = 0.9)}),y=Point.est.)) +
geom_violin(fill="#b8d9e3", color="#328da8") +
geom_jitter(alpha=0.3,size=0.2, color="#a8babf") +
stat_summary(fun=function(x) {quantile(x, probs = 0.9)}, geom="crossbar", width=0.2, color="orange") +
geom_hline(yintercept=1.1, linetype="dashed", color = "red") + # rule-of-thumb convergence threshold
ylim(0.9,2)+
labs(x="Model chains",y="Parameter estimates")+
theme_classic()
# Create a merged psrf.gamma (trait) plot
# Collect every loaded psrf.gamma.* matrix into one long table and show the
# distribution of Gelman-Rubin point estimates per model chain.
ls() %>%
grep("^psrf\\.gamma", ., value = TRUE) %>%
map_dfr(~ {
mat <- get(.x)
data.frame(modelchain = .x, as.data.frame(mat, , stringsAsFactors = FALSE)) %>%
rownames_to_column(var="parameter") %>%
mutate(model = str_split(modelchain, "_") %>% map_chr(1) %>% gsub("psrf.gamma.","",.)) %>%
mutate(sample = str_split(modelchain, "_") %>% map_chr(2)) %>% #extract sample info from model name
mutate(thin = str_split(modelchain, "_") %>% map_chr(3)) #extract thin info from model name
}) %>%
# FIX: reorder()'s argument is FUN (case-sensitive); the original 'fun=' was
# silently swallowed by '...', so chains were ordered by the default mean().
ggplot(.,aes(x=reorder(modelchain,-Point.est.,FUN=function(x) {quantile(x, probs = 0.9)}),y=Point.est.)) +
geom_violin(fill="#b8d9e3", color="#328da8") +
geom_jitter(alpha=0.3,size=0.2, color="#a8babf") +
stat_summary(fun=function(x) {quantile(x, probs = 0.9)}, geom="crossbar", width=0.2, color="orange") +
geom_hline(yintercept=1.1, linetype="dashed", color = "red") + # rule-of-thumb convergence threshold
ylim(0.9,2)+
labs(x="Model chains",y="Parameter estimates")+
theme_classic()
# Create a merged psrf.rho (phylogeny) plot
# Collect every loaded psrf.rho.* matrix into one long table and show the
# distribution of Gelman-Rubin point estimates per model chain.
ls() %>%
grep("^psrf\\.rho", ., value = TRUE) %>%
map_dfr(~ {
mat <- get(.x)
data.frame(modelchain = .x, as.data.frame(mat, , stringsAsFactors = FALSE)) %>%
rownames_to_column(var="parameter") %>%
# FIX: strip the correct 'psrf.rho.' prefix; the original copy-pasted
# 'psrf.beta.' here, leaving the prefix in this plot's model column.
mutate(model = str_split(modelchain, "_") %>% map_chr(1) %>% gsub("psrf.rho.","",.)) %>%
mutate(sample = str_split(modelchain, "_") %>% map_chr(2)) %>% #extract sample info from model name
mutate(thin = str_split(modelchain, "_") %>% map_chr(3)) #extract thin info from model name
}) %>%
# FIX: reorder()'s argument is FUN (case-sensitive); the original 'fun=' was
# silently swallowed by '...', so chains were ordered by the default mean().
ggplot(.,aes(x=reorder(modelchain,-Point.est.,FUN=function(x) {quantile(x, probs = 0.9)}),y=Point.est.)) +
geom_violin(fill="#b8d9e3", color="#328da8") +
geom_jitter(alpha=0.3,size=0.2, color="#a8babf") +
stat_summary(fun=function(x) {quantile(x, probs = 0.9)}, geom="crossbar", width=0.2, color="orange") +
geom_hline(yintercept=1.1, linetype="dashed", color = "red") + # rule-of-thumb convergence threshold
ylim(0.9,2)+
labs(x="Model chains",y="Parameter estimates")+
theme_classic()

7.9 Variance partitioning
# Compute variance partitioning
varpart <- computeVariancePartitioning(m)
# Mean and SD (as percentages) of the variance attributed to each model
# component, averaged across genomes
varpart$vals %>%
as.data.frame() %>%
rownames_to_column(var = "variable") %>%
pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
mutate(variable = factor(variable, levels = c("day", "treatment", "day:treatment", "logseqdepth", "Random: sample"))) %>%
group_by(variable) %>%
summarise(mean = 100 * mean(value), sd = 100 * sd(value)) %>%
tt()

| variable | mean | sd |
|---|---|---|
| day | 10.685484 | 9.075688 |
| treatment | 32.006682 | 10.090640 |
| day:treatment | 52.969849 | 10.087036 |
| logseqdepth | 4.337985 | 5.659128 |
# Basal tree used as the genome ordering reference for plotting
varpart_tree <- genome_tree
# Long-format variance partitioning table: genomes ordered to match the
# tree tips, variables stacked in (reversed) plotting order
varpart_table <- varpart$vals %>%
as.data.frame() %>%
rownames_to_column(var = "variable") %>%
pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
mutate(
  genome   = factor(genome, levels = rev(varpart_tree$tip.label)),
  variable = factor(variable, levels = rev(c("day", "treatment", "day:treatment", "logseqdepth", "Random: sample")))
)
# Phylum annotation: one row per genome (rownames = genome), ordered to
# match the tree tips
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by = join_by(phylum == phylum)) %>%
filter(genome %in% varpart_tree$tip.label) %>%
arrange(match(genome, varpart_tree$tip.label)) %>%
mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
column_to_rownames(var = "genome") %>%
select(phylum)
# Hex colour per phylum, sorted alphabetically by phylum name
colors_alphabetic <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by = join_by(phylum == phylum)) %>%
filter(genome %in% varpart_tree$tip.label) %>%
arrange(match(genome, varpart_tree$tip.label)) %>%
distinct(phylum, colors) %>%
arrange(phylum) %>%
pull(colors)
# Basal ggtree
# NOTE: force.ultrametric(method="extend") only coerces a tree that fails
# is.ultrametric() due to rounding; it is not a rate-smoothing method.
varpart_tree <- varpart_tree %>%
force.ultrametric(.,method="extend") %>%
ggtree(., size = 0.3)

***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
# Add phylum colors next to the tree tips
# gheatmap() draws the phylum annotation strip; the negative offset places
# it to the left of the tips
varpart_tree <- gheatmap(varpart_tree, phylum_colors, offset=-0.2, width=0.1, colnames=FALSE) +
scale_fill_manual(values=colors_alphabetic)+
labs(fill="Phylum")
#Reset fill scale to use a different colour profile in the heatmap
varpart_tree <- varpart_tree + new_scale_fill()
# Add variance stacked barplot
# NOTE(review): four fill colours are supplied for the 'variable' factor --
# confirm this matches the number of variance components actually present
# in varpart_table (unused factor levels are dropped by default).
vertical_tree <- varpart_tree +
scale_fill_manual(values=c("#cccccc","#ed8a45","#be3e2b","#f6de6c"))+
geom_fruit(
data=varpart_table,
geom=geom_bar,
mapping = aes(x=value, y=genome, fill=variable, group=variable),
pwidth = 2,
offset = 0.05,
width= 1,
orientation="y",
stat="identity")+
labs(fill="Variable")
vertical_tree
## Model fit
[1] 0.3506152
# Genomes considered 'predictive': cross-validated R2 of at least 0.3
# NOTE(review): MFCV is not defined in this chunk -- confirm it is the
# model-fit object computed from the cross-validation predictions and that
# MFCV[[2]] holds per-genome R2 values.
predictive_genomes <- tibble(genome=m$spNames, r2 = MFCV[[2]]) %>%
filter(r2>=0.3)
# Abundance covered by predictive genomes
# Mean across samples of the summed relative abundance of predictive genomes
genome_counts_filt %>%
mutate_if(is.numeric, ~ . / sum(.)) %>% # per-sample relative abundances
right_join(predictive_genomes, by="genome") %>%
select(-c(genome,r2)) %>%
colSums() %>%
mean(., na.rm=TRUE)

[1] 0.5068659
# Per-genome table combining predictive power (cross-validated R2), variance
# explained by the first component, and the posterior support/estimate of
# the second Beta covariate.
# NOTE(review): support_threshold / negsupport_threshold are not defined in
# this chunk -- confirm they are set earlier. The 'Feral'/'Domestic' labels
# look inherited from a different study; verify they are intended here.
var_pred_table <- tibble(mag=m$spNames,
pred=MFCV$R2,
var_pred=MFCV$R2 * varpart$vals[1,],
support=getPostEstimate(hM=m, parName="Beta")$support %>% .[2,],
estimate=getPostEstimate(hM=m, parName="Beta")$mean %>% .[2,]) %>%
mutate(enrichment=ifelse(support>=support_threshold,"Feral","Neutral")) %>%
mutate(enrichment=ifelse(support<=negsupport_threshold,"Domestic",enrichment))
# MAGs whose predicted variance contribution reaches 0.5%
predictive_mags <- var_pred_table %>%
filter(var_pred>=0.005) %>%
pull(mag)

7.10 Posterior estimates
# Select desired support threshold
# Posterior support above 'support' is called Positive; below 'negsupport'
# (= 1 - support) it is called Negative; everything in between is Neutral.
support=0.9
negsupport=1-support
# Basal tree
postestimates_tree <- genome_tree
# Posterior estimate table
# Classify each genome x covariate posterior support and spread covariates
# into columns (genomes as rownames), ordered to match the tree tips.
post_beta <- getPostEstimate(hM=m, parName="Beta")$support %>%
as.data.frame() %>%
mutate(variable=m$covNames) %>%
pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
mutate(genome=factor(genome, levels=rev(postestimates_tree$tip.label))) %>%
mutate(value = case_when(
value >= support ~ "Positive",
value <= negsupport ~ "Negative",
TRUE ~ "Neutral")) %>%
mutate(value=factor(value, levels=c("Positive","Neutral","Negative"))) %>%
pivot_wider(names_from = variable, values_from = value) %>%
# Covariate subset kept from an earlier analysis; retained for reference
#select(genome,sp_vulgaris,area_semi,area_urban,sp_vulgarisxarea_semi,sp_vulgarisxarea_urban,season_spring,season_winter,sp_vulgarisxseason_spring,sp_vulgarisxseason_winter) %>%
column_to_rownames(var="genome")
# Phylum annotation: one row per genome (rownames = genome), ordered to
# match the tree tips
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by = join_by(phylum == phylum)) %>%
filter(genome %in% postestimates_tree$tip.label) %>%
arrange(match(genome, postestimates_tree$tip.label)) %>%
mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
column_to_rownames(var = "genome") %>%
select(phylum)
# Hex colour per phylum, sorted alphabetically by phylum name
colors_alphabetic <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by = join_by(phylum == phylum)) %>%
filter(genome %in% postestimates_tree$tip.label) %>%
arrange(match(genome, postestimates_tree$tip.label)) %>%
distinct(phylum, colors) %>%
arrange(phylum) %>%
pull(colors)
# Basal ggtree
# NOTE: force.ultrametric(method="extend") only coerces a tree that fails
# is.ultrametric() due to rounding; it is not a rate-smoothing method.
postestimates_tree <- postestimates_tree %>%
force.ultrametric(.,method="extend") %>%
ggtree(., size = 0.3)

***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
#Add phylum colors next to the tree tips
postestimates_tree <- gheatmap(postestimates_tree, phylum_colors, offset=-0.2, width=0.1, colnames=FALSE) +
scale_fill_manual(values=colors_alphabetic)+
labs(fill="Phylum")
#Reset fill scale to use a different colour profile in the heatmap
postestimates_tree <- postestimates_tree + new_scale_fill()
# Add posterior significant heatmap
# The three colours map onto the factor levels Positive/Neutral/Negative
# (in that order); column names are rotated covariate labels on top
postestimates_tree <- gheatmap(postestimates_tree, post_beta, offset=0, width=0.5, colnames=TRUE, colnames_position="top",colnames_angle=90, colnames_offset_y=1, hjust=0) +
scale_fill_manual(values=c("#be3e2b","#f4f4f4","#b2b530"))+
labs(fill="Trend")
postestimates_tree +
vexpand(.25, 1) # expand top
## Predict responses
# Select modelchain of interest
load("hmsc/fit_model1_250_10.Rdata")

# Day gradient over which responses are predicted
gradient <- 7:35
gradientlength <- length(gradient)

# Posterior expected genome abundances along the day gradient for a single
# treatment group, returned in long format (day, treatment, genome, value).
# Replaces five copy-pasted pipelines that differed only in the group label.
# NOTE(review): the hard-coded 1000 assumes 250 samples x 4 chains of
# posterior draws -- confirm it matches the loaded model fit.
predict_day_gradient <- function(m, treatment_group, gradient) {
  gradientlength <- length(gradient)
  constructGradient(m,
                    focalVariable = "day",
                    non.focalVariables = list(logseqdepth = list(1),
                                              treatment = list(3, treatment_group)),
                    ngrid = gradientlength) %>%
    predict(m, Gradient = ., expected = TRUE) %>%
    do.call(rbind, .) %>% # stack the posterior draws row-wise
    as.data.frame() %>%
    mutate(day = rep(gradient, 1000)) %>%
    mutate(treatment = rep(treatment_group, gradientlength * 1000)) %>%
    pivot_longer(-c(day, treatment), names_to = "genome", values_to = "value")
}

# Treatment-specific gradient predictions
pred_TG1 <- predict_day_gradient(m, "TG1", gradient)
pred_TG2 <- predict_day_gradient(m, "TG2", gradient)
pred_TG3 <- predict_day_gradient(m, "TG3", gradient)
pred_TG4 <- predict_day_gradient(m, "TG4", gradient)
pred_TG5 <- predict_day_gradient(m, "TG5", gradient)
pred_all <- rbind(pred_TG1,pred_TG2,pred_TG3,pred_TG4,pred_TG5)

7.10.1 Function level
# Genome x function table: GIFT elements aggregated to the function level
functions_table <- genome_gifts %>%
to.elements(., GIFT_db) %>%
to.functions(., GIFT_db) %>%
as.data.frame()
# Community-level function fulfillment for every posterior draw:
# row_id indexes the posterior draws within each treatment/day combination,
# so each list element below is one draw's full community table
community_functions <- pred_all %>%
group_by(treatment, day, genome) %>%
mutate(row_id = row_number()) %>%
pivot_wider(names_from = genome, values_from = value) %>%
ungroup() %>%
group_split(row_id) %>%
as.list() %>%
lapply(., FUN = function(x){x %>%
# encode treatment into the rowname so it survives the matrix steps
mutate(day=str_c(day,"_",treatment)) %>%
dplyr::select(-c(row_id,treatment)) %>%
column_to_rownames(var = "day") %>%
as.data.frame() %>%
exp() %>% # back-transform the log-scale predictions
t() %>%
tss() %>% # presumably total-sum scaling to relative abundances -- confirm tss() source
to.community(functions_table,.,GIFT_db) %>%
as.data.frame() %>%
rownames_to_column(var="day") %>%
separate(day, into=c("day","treatment"), sep="_")
})

7.10.2 Biosynthesis
# Community-level biosynthesis capacity: one boxplot per day across
# posterior draws, faceted by GIFT function and treatment
do.call(rbind, community_functions) %>%
pivot_longer(!c(day,treatment), names_to = "GIFT", values_to = "value") %>%
mutate(day=as.numeric(day)) %>%
filter(GIFT %in% c("B02","B04","B07","B08")) %>% # selected biosynthesis GIFT codes
ggplot(aes(x=day, y=value, group=day))+
geom_boxplot(outlier.shape = NA) +
facet_nested(GIFT ~ treatment, scales="free")

7.10.3 Degradation
# Community-level degradation capacity: one boxplot per day across
# posterior draws, faceted by GIFT function and treatment
do.call(rbind, community_functions) %>%
pivot_longer(!c(day,treatment), names_to = "GIFT", values_to = "value") %>%
mutate(day=as.numeric(day)) %>%
filter(GIFT %in% c("D03","D05","D07","D09")) %>% # selected degradation GIFT codes
ggplot(aes(x=day, y=value, group=day))+
geom_boxplot(outlier.shape = NA) +
facet_nested(GIFT ~ treatment, scales="free")

7.10.4 Element level
# Genome x element table (GIFT elements, finer-grained than functions)
elements_table <- genome_gifts %>%
to.elements(., GIFT_db) %>%
as.data.frame()
# Community-level element fulfillment per posterior draw.
# NOTE(review): this pipeline duplicates the community_functions chunk above
# except for the reference table -- consider factoring into a shared helper.
community_elements <- pred_all %>%
group_by(treatment, day, genome) %>%
mutate(row_id = row_number()) %>%
pivot_wider(names_from = genome, values_from = value) %>%
ungroup() %>%
group_split(row_id) %>%
as.list() %>%
lapply(., FUN = function(x){x %>%
# encode treatment into the rowname so it survives the matrix steps
mutate(day=str_c(day,"_",treatment)) %>%
dplyr::select(-c(row_id,treatment)) %>%
column_to_rownames(var = "day") %>%
as.data.frame() %>%
exp() %>% # back-transform the log-scale predictions
t() %>%
tss() %>%
to.community(elements_table,.,GIFT_db) %>%
as.data.frame() %>%
rownames_to_column(var="day") %>%
separate(day, into=c("day","treatment"), sep="_")
})
# Combine all posterior draws into one table
community_elements <- do.call(rbind, community_elements)