This document is otherwise identical to HFD_diversity.Rmd but starts by importing a version of the OTU table that lacks Lactococcus OTUs.
library(tidyverse)
library(readxl)
library(MicrobeR)
library(vegan)
library(phyloseq)
library(philr)
library(doParallel)
library(UpSetR)
library(grid)
library(ggtern)
library(ggtree)
library(ape)
library(picante)
library(randomForest)
library(ROCR)
NSLOTS=6
registerDoParallel(cores=NSLOTS)
knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning=FALSE, tidy=FALSE, cache=FALSE)
HFDcolor="#E69F00"
LFDcolor="#0072B2"
sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ROCR_1.0-7 gplots_3.0.1 randomForest_4.6-14
## [4] picante_1.7 nlme_3.1-137 ape_5.2
## [7] ggtree_1.12.0 ggtern_3.0.0 UpSetR_1.4.0
## [10] doParallel_1.0.11 iterators_1.0.9 foreach_1.4.4
## [13] philr_1.6.0 phyloseq_1.24.0 vegan_2.5-2
## [16] lattice_0.20-38 permute_0.9-4 MicrobeR_0.3.2
## [19] readxl_1.1.0 forcats_0.3.0 stringr_1.3.1
## [22] dplyr_0.7.5 purrr_0.2.5 readr_1.1.1
## [25] tidyr_0.8.1 tibble_1.4.2 ggplot2_3.0.0
## [28] tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.3-2 rprojroot_1.3-2 XVector_0.20.0
## [4] rstudioapi_0.7 DT_0.4 bit64_0.9-7
## [7] lubridate_1.7.4 xml2_1.2.0 codetools_0.2-16
## [10] splines_3.5.3 mnormt_1.5-5 robustbase_0.93-2
## [13] knitr_1.20 ade4_1.7-11 jsonlite_1.5
## [16] broom_0.4.4 cluster_2.0.7-1 latex2exp_0.4.0
## [19] compiler_3.5.3 httr_1.3.1 rvcheck_0.1.0
## [22] backports_1.1.2 assertthat_0.2.0 Matrix_1.2-15
## [25] lazyeval_0.2.1 cli_1.0.0 htmltools_0.3.6
## [28] tools_3.5.3 bindrcpp_0.2.2 igraph_1.2.1
## [31] gtable_0.2.0 glue_1.2.0 reshape2_1.4.3
## [34] fastmatch_1.1-0 Rcpp_0.12.19 Biobase_2.40.0
## [37] cellranger_1.1.0 Biostrings_2.48.0 zCompositions_1.1.1
## [40] multtest_2.36.0 gdata_2.18.0 DECIPHER_2.8.1
## [43] psych_1.8.4 tensorA_0.36.1 proto_1.0.0
## [46] rvest_0.3.2 phangorn_2.4.0 gtools_3.5.0
## [49] DEoptimR_1.0-8 zlibbioc_1.26.0 MASS_7.3-51.1
## [52] scales_0.5.0 rtk_0.2.5.7 hms_0.4.2
## [55] biomformat_1.10.1 rhdf5_2.24.0 yaml_2.2.0
## [58] memoise_1.1.0 gridExtra_2.3 NADA_1.6-1
## [61] stringi_1.2.3 RSQLite_2.1.1 S4Vectors_0.18.3
## [64] energy_1.7-5 tidytree_0.1.9 caTools_1.17.1
## [67] BiocGenerics_0.26.0 boot_1.3-20 truncnorm_1.0-8
## [70] bitops_1.0-6 compositions_1.40-2 rlang_0.2.1
## [73] pkgconfig_2.0.1 evaluate_0.10.1 Rhdf5lib_1.2.1
## [76] bindr_0.1.1 treeio_1.4.1 htmlwidgets_1.2
## [79] bit_1.1-14 tidyselect_0.2.4 plyr_1.8.4
## [82] magrittr_1.5 R6_2.2.2 IRanges_2.14.10
## [85] DBI_1.0.0 pillar_1.2.3 haven_1.1.1
## [88] foreign_0.8-71 withr_2.1.2 mgcv_1.8-27
## [91] survival_2.43-3 bayesm_3.1-0.1 modelr_0.1.2
## [94] crayon_1.3.4 KernSmooth_2.23-15 plotly_4.7.1
## [97] rmarkdown_1.10 data.table_1.11.4 blob_1.1.1
## [100] digest_0.6.15 stats4_3.5.3 munsell_0.5.0
## [103] viridisLite_0.3.0 quadprog_1.5-5
Will generate multiple versions of the OTU table in a list which will include a filtered (remove noisy features), 0-replaced, CLR-normalized, subsampled, multiple-subsampled, subsampled to 5000 and samples with less reads removed, and PhILR-normalized versions.
metadata<-read_tsv("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/CollatedData/metadata_filtered.tsv") %>% mutate(SampleID=DB_Sample) %>% as.data.frame()
rownames(metadata)<-metadata$DB_Sample
taxonomy<-read_tsv("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/dbs/gg_13_8_otus/taxonomy/97_otu_taxonomy.txt", col_names = c("OTU","Taxonomy")) %>%
separate(Taxonomy,
c("Kingdom",
"Phylum",
"Class",
"Order",
"Family",
"Genus",
"Species"
),
sep="; ",
remove=FALSE) %>%
as.data.frame()
rownames(taxonomy)<-taxonomy$OTU
OTUs<-list()
OTUs$Raw<-read.table("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/CollatedData/OTUs_nol.tsv", header=T, sep='\t', row.names=1)
taxonomy<-subset(taxonomy, OTU %in% rownames(OTUs$Raw))
OTUs$Filtered<-Confidence.Filter(OTUs$Raw, 3, 10, TRUE)
OTUs$CZM<-zCompositions::cmultRepl(t(OTUs$Filtered),
label=0,
method="CZM",
output="counts",
suppress.print=TRUE) %>%
t() #with a non-prior based 0 replacement
OTUs$CLR<-apply(log2(OTUs$CZM), 2, function(x)x-mean(x))
OTUs$Subsampled<-Subsample.Table(OTUs$Filtered, VERBOSE=T)
OTUs$min5kSubsampled<-OTUs$Filtered[,colSums(OTUs$Filtered)>=5000]
dim(OTUs$min5kSubsampled)
## [1] 14589 923
OTUs$min5kSubsampled<-Subsample.Table(OTUs$Filtered, VERBOSE=T, DEPTH=5000)
OTUs$multiSubsampled<-Multiple.Subsample.Table(FEATURES=OTUs$Filtered, VERBOSE=T, THREADS = NSLOTS, NSAMPS = 51)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.00 64.00 84.00 77.27 92.00 138.00
trees<-list()
trees$Raw<-read_tree_greengenes("../../dbs/gg_13_8_otus/trees/97_otus.tree")
trees$Raw$edge.length[is.nan(trees$Raw$edge.length)]<-0 # workaround for import error on a single edge https://forum.qiime2.org/t/greengenes-tree-download-with-branch-lengths/3329/5 a la qiime1
trees$Raw<-drop.tip(trees$Raw, tip=trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(OTUs$Raw)])
trees$PhILR<-drop.tip(trees$Raw, tip=trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(OTUs$Filtered)])
trees$PhILR<-makeNodeLabel(trees$PhILR, method="number", prefix="n")
trees$Subsampled<-drop.tip(trees$Raw, tip=trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(OTUs$Subsampled)])
trees$min5kSubsampled<-drop.tip(trees$Raw, tip=trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(OTUs$min5kSubsampled)])
trees$multiSubsampled<-drop.tip(trees$Raw, tip=trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(OTUs$multiSubsampled)])
OTUs$PhILR<-t(philr(t(OTUs$CZM), trees$PhILR, part.weights='enorm.x.gm.counts', ilr.weights='blw.sqrt'))
saveRDS(OTUs, "RDS/OTUs.RDS")
saveRDS(trees, "RDS/trees.RDS")
saveRDS(taxonomy, "RDS/taxonomy.RDS")
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4502332 240.5 7266798 388.1 NA 7266798 388.1
## Vcells 109680061 836.8 1348380823 10287.4 16384 1680471828 12821.0
OTUs$TaxaSummary<-Summarize.Taxa(OTUs$Filtered, taxonomy)
phab<-
OTUs$TaxaSummary$Phylum %>%
Make.Percent() %>%
as.data.frame() %>%
rownames_to_column("Phylum") %>%
as.tibble() %>%
mutate(Phylum=gsub("..+p__","", Phylum)) %>%
gather(-Phylum, key="SampleID", value="Abundance")
tp<-
phab %>% group_by(Phylum) %>% summarize(mean=mean(Abundance)) %>%
arrange(desc(mean)) %>%
top_n(9, mean) %>%
bind_rows(., tibble(Phylum="Other", mean=0))
por<-
phab %>%
filter(Phylum=="Firmicutes") %>%
arrange(desc(Abundance)) %>%
pull(SampleID)
phab %>%
mutate(Phylum=if_else(Phylum %in% tp$Phylum, Phylum, "Other")) %>%
mutate(Phylum=factor(Phylum, levels = rev(tp$Phylum))) %>%
left_join(metadata) %>%
#filter(Colonization=="SPF" | Colonization=="MUSD") %>%
mutate(SampleID=factor(SampleID, levels=por)) %>%
ggplot(aes(x=SampleID, y=Abundance, fill=Phylum)) +
geom_bar(stat="identity") +
facet_grid(~Diet_Classification, scales="free_x", space = "free") +
theme_MicrobeR() +
theme(axis.text.x = element_blank()) +
theme(axis.ticks.x = element_blank()) +
coord_cartesian(expand=F) +
xlab("Sample") +
ylab("Phylum Abundance (%)") +
scale_fill_manual(values=rev(c(
"blue4",
"olivedrab",
"firebrick",
"gold",
"darkorchid",
"steelblue2",
"chartreuse1",
"aquamarine",
"coral",
"grey"
)))
ggsave("figures/Phylumplot.pdf", device="pdf", height=3, width=6)
rm(phab, tp, por)
uplot<-
OTUs$Raw %>%
as.data.frame() %>%
rownames_to_column("OTU") %>%
as_tibble() %>%
gather(-OTU, key=SampleID, value=Count) %>%
mutate(Count=if_else(Count==0, 0, 1)) %>%
left_join(metadata[,c("SampleID","StudyID")]) %>%
select(-SampleID) %>%
group_by(StudyID, OTU) %>%
summarize(Count=max(Count))
uplot<-
uplot %>%
spread(key=StudyID, value=Count, fill=0) %>%
as.data.frame() %>%
upset(., nsets=30, nintersects=300, order.by="freq", show.numbers=F, matrix.dot.alpha=0)
uplot
pdf("figures/upset.pdf", height=11, width=20, useDingbats=F)
print(uplot)
dev.off()
## quartz_off_screen
## 2
rm(uplot)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4735475 253.0 7266798 388.1 NA 7266798 388.1
## Vcells 130610151 996.5 1078704658 8229.9 16384 1680471828 12821.0
Then will generate the CLR-euclidian, PhILR-euclidian, weighted and unweighted unifrac, Bray Curtis, and JSD metrics for each of the versions of the OTU tables created above.
DistanceMatrices<-list()
DistanceMatrices$Subsampled<-list()
DistanceMatrices$min5kSubsampled<-list()
DistanceMatrices$multiSubsampled<-list()
DistanceMatrices$Subsampled[["PhILR Euclidian"]]<-dist(t(OTUs$PhILR), method="euclidian"); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4724445 252.4 7266798 388.1 NA 7266798 388.1
## Vcells 131121183 1000.4 862963726 6583.9 16384 1680471828 12821.0
DistanceMatrices$Subsampled[["CLR Euclidian"]]<-dist(t(OTUs$CLR), method="euclidian"); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4724460 252.4 7266798 388.1 NA 7266798 388.1
## Vcells 131726752 1005.0 690370980 5267.2 16384 1680471828 12821.0
DistanceMatrices$Subsampled[["Bray Curtis"]]<-vegdist(t(Make.Proportion(OTUs$Subsampled)), method="bray"); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4724726 252.4 7266798 388.1 NA 7266798 388.1
## Vcells 132333861 1009.7 552296784 4213.7 16384 1680471828 12821.0
DistanceMatrices$Subsampled[["Jensen-Shannon Divergence"]]<-phyloseq::distance(phyloseq(otu_table(Make.Proportion(OTUs$Subsampled), taxa_are_rows = T)), method="jsd", parallel=T); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6548797 349.8 13956171 745.4 NA 13497839 720.9
## Vcells 134429255 1025.7 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$Subsampled[["weighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(OTUs$Subsampled), taxa_are_rows = T), phy=trees$Subsampled), weighted=T, parallel=T); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6550814 349.9 13956171 745.4 NA 13956171 745.4
## Vcells 135040008 1030.3 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$Subsampled[["unweighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(OTUs$Subsampled), taxa_are_rows = T), phy=trees$Subsampled), weighted=F, parallel=T); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6550833 349.9 13956171 745.4 NA 13956171 745.4
## Vcells 135646694 1035.0 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$min5kSubsampled[["PhILR Euclidian"]]<-dist(t(OTUs$PhILR[,colnames(OTUs$min5kSubsampled)]), method="euclidian"); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6550882 349.9 13956171 745.4 NA 13956171 745.4
## Vcells 136073168 1038.2 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$min5kSubsampled[["CLR Euclidian"]]<-dist(t(OTUs$CLR[,colnames(OTUs$min5kSubsampled)]), method="euclidian"); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6550919 349.9 13956171 745.4 NA 13956171 745.4
## Vcells 136499633 1041.5 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$min5kSubsampled[["Bray Curtis"]]<-vegdist(t(Make.Proportion(OTUs$min5kSubsampled)), method="bray"); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6550949 349.9 13956171 745.4 NA 13956171 745.4
## Vcells 136926100 1044.7 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$min5kSubsampled[["Jensen-Shannon Divergence"]]<-phyloseq::distance(phyloseq(otu_table(Make.Proportion(OTUs$min5kSubsampled), taxa_are_rows = T)), method="jsd", parallel=T); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6550965 349.9 13956171 745.4 NA 13956171 745.4
## Vcells 137352556 1048.0 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$min5kSubsampled[["weighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(OTUs$min5kSubsampled), taxa_are_rows = T), phy=trees$min5kSubsampled), weighted=T, parallel=T); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6550984 349.9 13956171 745.4 NA 13956171 745.4
## Vcells 137779025 1051.2 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$min5kSubsampled[["unweighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(OTUs$min5kSubsampled), taxa_are_rows = T), phy=trees$min5kSubsampled), weighted=F, parallel=T); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6551003 349.9 13956171 745.4 NA 13956171 745.4
## Vcells 138205486 1054.5 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$multiSubsampled[["PhILR Euclidian"]]<-dist(t(OTUs$PhILR[,colnames(OTUs$multiSubsampled)]), method="euclidian"); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6551051 349.9 13956171 745.4 NA 13956171 745.4
## Vcells 138812185 1059.1 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$multiSubsampled[["CLR Euclidian"]]<-dist(t(OTUs$CLR[,colnames(OTUs$multiSubsampled)]), method="euclidian"); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6551088 349.9 13956171 745.4 NA 13956171 745.4
## Vcells 139418875 1063.7 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$multiSubsampled[["Bray Curtis"]]<-vegdist(t(Make.Proportion(OTUs$multiSubsampled)), method="bray"); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6551117 349.9 13956171 745.4 NA 13956171 745.4
## Vcells 140024466 1068.4 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$multiSubsampled[["Jensen-Shannon Divergence"]]<-phyloseq::distance(phyloseq(otu_table(Make.Proportion(OTUs$multiSubsampled), taxa_are_rows = T)), method="jsd", parallel=T); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6551133 349.9 13956172 745.4 NA 13956172 745.4
## Vcells 141155435 1077.0 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$multiSubsampled[["weighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(OTUs$multiSubsampled), taxa_are_rows = T), phy=trees$multiSubsampled), weighted=T, parallel=T); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6551152 349.9 19331441 1032.5 NA 13956172 745.4
## Vcells 141762129 1081.6 441837427 3371.0 16384 1680471828 12821.0
DistanceMatrices$multiSubsampled[["unweighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(OTUs$multiSubsampled), taxa_are_rows = T), phy=trees$multiSubsampled), weighted=F, parallel=T); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6551171 349.9 19331441 1032.5 NA 19331441 1032.5
## Vcells 142368815 1086.2 441837427 3371.0 16384 1680471828 12821.0
saveRDS(DistanceMatrices, "RDS/DistanceMatrices.RDS")
alldists<-
lapply(names(DistanceMatrices$multiSubsampled), function(dname){
DistanceMatrices$multiSubsampled[[dname]] %>%
as.matrix() %>%
as.data.frame() %>%
rownames_to_column("Subject") %>%
gather(-Subject, key="Match", value=Distance) %>%
mutate(Metric=dname)
}) %>%
do.call(bind_rows, .)
alldists %>%
mutate(Metric=factor(Metric, levels=c("PhILR Euclidian", "unweighted UniFrac","weighted UniFrac", "CLR Euclidian","Bray Curtis","Jensen-Shannon Divergence"))) %>%
ggplot(aes(x=Distance, color=Metric)) +
geom_freqpoly() +
theme_MicrobeR() +
theme(legend.position="none") +
facet_wrap(~Metric, scales="free")
ggsave("figures/disthist.pdf", device="pdf", width=7, height=5, useDingbats=F)
rm(alldists)
lapply(names(DistanceMatrices$multiSubsampled), function(x){
DistanceMatrices$multiSubsampled[[x]] %>%
as.vector() %>%
summary() %>%
broom::tidy() %>%
mutate(Metric=x) %>%
select(Metric, everything())
}) %>%
do.call(bind_rows, .) %>%
Nice.Table()
Because the non-phylogeneticly weighted metrics are so highly saturated in completely non-overlapping samples, will remove them from analysis of the all-sample analysis.
for (x in names(DistanceMatrices)){
DistanceMatrices[[x]]$`CLR Euclidian`<-NULL
DistanceMatrices[[x]]$`Bray Curtis`<-NULL
DistanceMatrices[[x]]$`Jensen-Shannon Divergence`<-NULL
}
Here only ordinating mouse samples that are from SPF mice, or gnotobiotic mice colonized with mouse communities.
mousesamps<-metadata %>% filter(Colonization %in% c("SPF")) %>% pull(SampleID)
PCoAs<-list()
for (y in names(DistanceMatrices)){
for(x in names(DistanceMatrices[[y]])){
tm<-mousesamps[mousesamps %in% colnames(as.matrix(DistanceMatrices[[y]][[x]]))]
message(x,"-", y)
PCoAs[[paste0(x,"-",y)]]<-
as.matrix(DistanceMatrices[[y]][[x]])[tm,tm] %>%
as.dist(.) %>%
ape::pcoa()
}
}
rm(tm, y, x)
lapply(names(PCoAs), function(x){
PCoAs[[x]]$values %>%
as.data.frame() %>%
rownames_to_column("Axis") %>%
as_tibble() %>%
mutate(Metric=x) %>%
mutate(Axis=as.numeric(Axis)) %>%
filter(Axis<=10)
}) %>%
do.call(bind_rows, .) %>%
mutate(Pvar=100*Relative_eig) %>%
separate(Metric, c("Metric","Dataset"), sep="-") %>%
ggplot(aes(x=Axis, y=Pvar, color=Metric)) +
geom_line() +
theme_MicrobeR() +
ylab("% variation explained") +
xlab("PC") +
scale_x_continuous(breaks=1:10) +
facet_wrap(~Dataset)
ggsave("figures/scree.pdf", device="pdf", width=9, height=3, useDingbats=F)
lapply(names(PCoAs), function(x){
PCoAs[[x]]$values %>%
as.data.frame() %>%
rownames_to_column("Axis") %>%
as_tibble() %>%
mutate(Metric=x) %>%
mutate(Axis=as.numeric(Axis)) %>%
filter(Axis<=10)
}) %>%
do.call(bind_rows, .) %>%
select(Metric, everything()) %>%
separate(Metric, c("Metric","Dataset"), sep="-") %>%
Nice.Table(.)
plotorder<-c("unweighted UniFrac","weighted UniFrac","PhILR Euclidian")
lapply(names(PCoAs), function(x) PCoAs[[x]]$vectors %>% as.data.frame() %>% rownames_to_column("SampleID") %>% select(SampleID, Axis.1, Axis.2, Axis.3) %>% mutate(Metric=x)) %>%
do.call(bind_rows, .) %>%
as_tibble() %>%
left_join(metadata) %>%
separate(Metric, c("Metric","Dataset"), sep="-") %>%
mutate(Metric=factor(Metric, levels=plotorder)) %>%
ggplot(aes(x=Axis.1, y=Axis.2)) +
geom_point(aes(color=Diet_Classification), shape=16, alpha=0.3) +
scale_color_manual(values=c(HFDcolor, LFDcolor)) +
theme_MicrobeR() +
facet_wrap(Dataset~Metric, scales="free", ncol=3) +
theme(panel.border = element_blank(), axis.line = element_line()) +
theme(axis.text = element_blank(), axis.ticks = element_blank()) +
xlab("") +
ylab("") +
theme(legend.position = "none")
ggsave("figures/PCoAs_diet.pdf", device="pdf", height=7.5, width=6, useDingbats=F)
lapply(names(PCoAs), function(x) PCoAs[[x]]$vectors %>% as.data.frame() %>% rownames_to_column("SampleID") %>% select(SampleID, Axis.1, Axis.2, Axis.3) %>% mutate(Metric=x)) %>%
do.call(bind_rows, .) %>%
as_tibble() %>%
left_join(metadata) %>%
separate(Metric, c("Metric","Dataset"), sep="-") %>%
mutate(Metric=factor(Metric, levels=plotorder)) %>%
ggplot(aes(x=Axis.2, y=Axis.3)) +
geom_point(aes(color=Diet_Classification), shape=16, alpha=0.3) +
scale_color_manual(values=c(HFDcolor, LFDcolor)) +
theme_MicrobeR() +
facet_wrap(Dataset~Metric, scales="free", ncol=3) +
theme(panel.border = element_blank(), axis.line = element_line()) +
theme(axis.text = element_blank(), axis.ticks = element_blank()) +
xlab("") +
ylab("") +
theme(legend.position = "none")
ggsave("figures/PCoAs_diet_2v3.pdf", device="pdf", height=7.5, width=6, useDingbats=F)
ps<-
lapply(names(PCoAs), function(x) PCoAs[[x]]$vectors %>% as.data.frame() %>% rownames_to_column("SampleID") %>% select(SampleID, Axis.1, Axis.2, Axis.3) %>% mutate(Metric=x)) %>%
do.call(bind_rows, .) %>%
as_tibble() %>%
left_join(metadata) %>%
separate(Metric, c("Metric","Dataset"), sep="-") %>%
mutate(Metric=factor(Metric, levels=plotorder)) %>%
ggplot(aes(x=Axis.1, y=Axis.2)) +
geom_point(aes(color=StudyID), shape=16, alpha=0.4) +
theme_MicrobeR() +
facet_wrap(Dataset~Metric, scales="free", ncol=3) +
theme(panel.border = element_blank(), axis.line = element_line()) +
theme(axis.text = element_blank(), axis.ticks = element_blank()) +
xlab("") +
ylab("") +
theme(legend.position = "none")
ps
ggsave("figures/PCoAs_study.pdf", device="pdf", height=7.5, width=6, useDingbats=F)
ps<-
lapply(names(PCoAs), function(x) PCoAs[[x]]$vectors %>% as.data.frame() %>% rownames_to_column("SampleID") %>% select(SampleID, Axis.1, Axis.2, Axis.3) %>% mutate(Metric=x)) %>%
do.call(bind_rows, .) %>%
as_tibble() %>%
left_join(metadata) %>%
separate(Metric, c("Metric","Dataset"), sep="-") %>%
mutate(Metric=factor(Metric, levels=plotorder)) %>%
ggplot(aes(x=Axis.2, y=Axis.3)) +
geom_point(aes(color=StudyID), shape=16, alpha=0.4) +
theme_MicrobeR() +
facet_wrap(Dataset~Metric, scales="free", ncol=3) +
theme(panel.border = element_blank(), axis.line = element_line()) +
theme(axis.text = element_blank(), axis.ticks = element_blank()) +
xlab("PC2") +
ylab("PC3") +
theme(legend.position = "none")
ps
ggsave("figures/PCoAs_study_2v3.pdf", device="pdf", height=7.5, width=6, useDingbats=F)
ps<-ps + theme(legend.position="right")
ps<-cowplot::get_legend(ps)
plot(ps)
ggsave("figures/PCoAs_study_legend.pdf", ps, device="pdf", height=5, width=3, useDingbats=F)
ADONIS<-tibble()
for(x in names(DistanceMatrices)){
for(y in names(DistanceMatrices[[x]])){
message(x,"-",y)
ts<-mousesamps[mousesamps %in% labels(DistanceMatrices[[x]][[y]])]
tm<-metadata %>% as.data.frame() %>% remove_rownames() %>% filter(SampleID %in% ts) %>% column_to_rownames("SampleID")
td<-as.matrix(DistanceMatrices[[x]][[y]])[ts,ts] %>% as.dist()
tm<-tm[labels(td),]
ta<-adonis(td~Diet_Classification+StudyID, data=tm, strata=tm$StudyID, permutations=999, parallel=NSLOTS)
message("ADONIS complete for ",x)
ta<-ta$aov.tab %>%
as.data.frame() %>%
rownames_to_column("Term") %>%
mutate(Metric=paste0(x,"-",y)) %>%
select(Metric, everything()) %>%
rename(Pvalue=`Pr(>F)`)
ADONIS<-bind_rows(ADONIS, ta)
rm(td, tm, ta, ts)
gc()
}
}
saveRDS(ADONIS, "RDS/ADONIS.RDS")
ADONIS<-ADONIS %>% separate(Metric, c("Data","Metric"), sep="-")
write_tsv(ADONIS, "figures/ADONIS_wstrata.txt")
Nice.Table(ADONIS)
Based on the data generated above, it appears clear that there is some effect of dietary fat on microbiota composition. This can be observed in the PCoA visualizations and the apparent differences in the Firmicutes content seen in the boxplots. These comparison inevitably required a high level of subsampling, and as such further analysis will focus on within study normalization.
rm(DistanceMatrices, PCoAs, ps, plotorder, x, mousesamps)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6645867 355.0 19331441 1032.5 NA 19331441 1032.5
## Vcells 132040063 1007.4 441837427 3371.0 16384 1680471828 12821.0
To conduct this analysis, will use one function to analyze each study using the same approach but with study-appropriate normalization
processStudy<-function(Study){
# PS is the per study object which is a named list with the following objects:
# Study
# Metadata
# AlphaDiversity
# AlphaDiversity_Stats
# DistanceMatrices
# PCoAs
# ADONISs
# FBratio
# OTUs_raw
# OTUs_subsampled
# OTUs_clr
# OTUs_PhILR
# OTUs_filtered
# Tree_GG
# Tree_PhILR
# Subsample_depth
PS<-list()
PS$Study<-Study
PS$Metadata<-subset(metadata, StudyID==Study)
PS$OTUs_raw<-OTUs$Raw[,PS$Metadata$SampleID]
PS$OTUs_raw<-PS$OTUs_raw[rowSums(PS$OTUs_raw)>=1,]
message(Study, " table has dimensions:", paste(dim(PS$OTUs_raw), collapse="x"))
PS$Subsample_depth<-min(colSums(PS$OTUs_raw))
PS$OTUs_subsampled<-Subsample.Table(PS$OTUs_raw, DEPTH = PS$Subsample_depth, VERBOSE = T)
PS$OTUs_filtered<-Confidence.Filter(PS$OTUs_raw, 3, 10)
PS$Tree_GG<-drop.tip(trees$Raw, trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(PS$OTUs_subsampled)])
message("FBratioing")
PS$FBratio<-
Summarize.Taxa(PS$OTUs_raw, taxonomy %>% as.data.frame() %>% remove_rownames() %>% column_to_rownames("OTU"))$Phylum %>%
Make.Percent() %>%
t() %>%
as.data.frame() %>%
rownames_to_column("SampleID") %>%
mutate(FBratio=(`k__Bacteria;p__Firmicutes`+0.1)/(`k__Bacteria;p__Bacteroidetes`+0.1)) %>%
select(SampleID, FBratio)
message("Alpha diversity")
PS$AlphaDiversity<-data.frame(
Shannon=vegan::diversity(PS$OTUs_subsampled, index="shannon", MARGIN=2),
Chao1=vegan::estimateR(t(PS$OTUs_subsampled)) %>% t() %>% as.data.frame %>% rename(Chao1=S.chao1) %>% pull(Chao1),
FaithsPD=picante::pd(t(PS$OTUs_subsampled), PS$Tree_GG, include.root=F)$PD
) %>% rownames_to_column("SampleID") %>%
select(SampleID, everything()) %>%
left_join(PS$FBratio) %>%
as_tibble() %>%
gather(-SampleID, key="Metric", value="Diversity") %>%
left_join(PS$Metadata[,c("SampleID","Diet_Classification", "StudyID")]) %>%
select(SampleID, StudyID, Diet_Classification, Metric, Diversity)
PS$AlphaDiversity <-
PS$AlphaDiversity %>%
left_join(
PS$AlphaDiversity %>%
group_by(Metric, Diet_Classification) %>%
summarize(mean=mean(log2(Diversity))) %>%
spread(key=Diet_Classification, value=mean) %>%
rename(mean_log2HFD=HFD, mean_log2LFD=LFD)
) %>%
mutate(log2FC=log2(Diversity)-mean_log2LFD)
PS$AlphaDiversity_Stats <-
PS$AlphaDiversity %>%
group_by(Metric) %>%
do(
broom::tidy(t.test(log2FC~Diet_Classification, data=., conf.int=TRUE, conf.level=0.95))
) %>%
mutate(StudyID=Study) %>%
select(StudyID, Metric, log2FC=estimate, Pvalue=p.value, mean_LFD=estimate2, mean_HFD=estimate1, CI_low=conf.low, CI_high=conf.high)
PS$OTUs_clr<-Make.CLR(PS$OTUs_filtered, CZM=TRUE)
PS$DistanceMatrices<-list()
message("PhILR Euclidian")
PS$Tree_PhILR<-drop.tip(trees$Raw, trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(PS$OTUs_filtered)])
PS$Tree_PhILR<-makeNodeLabel(PS$Tree_PhILR, method="number", prefix="n")
PS$OTUs_PhILR<-t(philr(t(PS$OTUs_filtered+0.5), PS$Tree_PhILR, part.weights='enorm.x.gm.counts', ilr.weights='blw.sqrt'))
PS$DistanceMatrices[["PhILR Euclidian"]]<-dist(t(PS$OTUs_PhILR), method="euclidian")
message("CLR Euclidian")
PS$DistanceMatrices[["CLR Euclidian"]]<-dist(t(PS$OTUs_clr), method="euclidian")
message("Bray Curtis")
PS$DistanceMatrices[["Bray Curtis"]]<-vegdist(t(Make.Proportion(PS$OTUs_subsampled)), method="bray")
message("Wunifrac")
PS$DistanceMatrices[["weighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(PS$OTUs_subsampled), taxa_are_rows = T), phy=PS$Tree_GG), weighted=T, parallel=T)
message("UWunifrac")
PS$DistanceMatrices[["unweighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(PS$OTUs_subsampled), taxa_are_rows = T), phy=PS$Tree_GG), weighted=F, parallel=T)
message("JSD")
PS$DistanceMatrices[["Jensen-Shannon divergence"]]<-phyloseq::distance(phyloseq(otu_table(Make.Proportion(PS$OTUs_subsampled), taxa_are_rows = T)), method="jsd", parallel=T)
message("PCoAing")
PS$PCoAs<-lapply(PS$DistanceMatrices, pcoa)
message("ADONISing")
PS$ADONISs<-lapply(PS$DistanceMatrices, function(x) adonis(x~PS$Metadata[match(labels(x), PS$Metadata$SampleID),]$Diet_Classification))
PS$ADONISs<-
lapply(names(PS$ADONISs), function(x){
PS$ADONISs[[x]]$aov.tab %>%
as.data.frame() %>%
rownames_to_column("Term") %>%
as.tibble() %>%
mutate(Metric=x) %>%
select(Metric, Term, R2, Pvalue=`Pr(>F)`) %>%
mutate(Term=if_else(Term=="PS$Metadata[match(labels(x), PS$Metadata$SampleID), ]$Diet_Classification", "Diet_Classification", Term))
}) %>%
do.call(bind_rows,.)
gc()
return(PS)
}
PerStudy<-list()
for(Study in (unique(metadata$StudyID))){
message("--------------------->", Study)
PerStudy[[Study]]<-processStudy(Study)
}
saveRDS(PerStudy, "RDS/PerStudy.RDS")
In the forest plot I will also include all studies pooled. Due to differences in baseline, I will use the log2 fold changes to pool studies and calculate the summed effect.
mod<-lapply(PerStudy, function(x) x$AlphaDiversity) %>%
do.call(bind_rows, .) %>%
mutate(Diet_Classification=factor(Diet_Classification, levels=c("LFD","HFD"))) %>%
filter(!StudyID %in% c("David 2014", "Wu 2011")) %>% #remove human samples
group_by(Metric)
AlphaCombined<-tibble(StudyID=character(0), Metric=character(0), log2FC=numeric(0), Pvalue=numeric(0), mean_LFD=numeric(0), mean_HFD=numeric(0), CI_low=numeric(0), CI_high=numeric(0))
for(i in unique(mod$Metric)){
fit<-lmerTest:::lmer(log2FC~Diet_Classification+(1|StudyID), data=subset(mod, Metric==i))
cf<-confint(fit,level = 0.95)
AlphaCombined<-bind_rows(AlphaCombined, tibble(
StudyID="Combined",
Metric=i,
log2FC=summary(fit)$coefficients["Diet_ClassificationHFD", "Estimate"],
Pvalue=anova(fit)$`Pr(>F)`,
mean_LFD=NA,
mean_HFD=NA,
CI_low=cf["Diet_ClassificationHFD",1],
CI_high=cf["Diet_ClassificationHFD",2]
))
}
NSamples<-
lapply(names(PerStudy), function(x) tibble(StudyID=x, Nsamples=length(unique(PerStudy[[x]]$AlphaDiversity$SampleID)))) %>%
do.call(bind_rows, .) %>%
arrange(desc(Nsamples)) %>%
mutate(Study=paste0(StudyID, " (n=", Nsamples,")")) %>%
bind_rows(tibble(StudyID="Combined", Study="Combined")) %>%
mutate(Study=factor(Study, levels=rev(Study)))
lapply(PerStudy, function(x) x$AlphaDiversity_Stats) %>%
do.call(bind_rows, .) %>%
filter(!StudyID %in% c("David 2014", "Wu 2011", "Anhe 2015")) %>% #Anhe 2015 removed as no replicates so any comparison is not really appropriate
bind_rows(AlphaCombined) %>%
mutate(Significant=case_when(
Pvalue<0.05 & log2FC>0 ~ "* up",
Pvalue<0.05 & log2FC<0 ~ "* down",
TRUE~"ns"
)) %>%
ungroup() %>%
mutate(Metric=factor(Metric, levels=c("Chao1","Shannon", "FaithsPD","FBratio"))) %>%
left_join(NSamples) %>%
ggplot(aes(x=log2FC, y=Study, color=Significant)) +
geom_vline(xintercept = 0, linetype="dashed", color="grey50") +
geom_errorbarh(aes(xmin=CI_low, xmax=CI_high), height=0 ) +
geom_point() +
facet_grid(~Metric, scales="free_x") +
theme_MicrobeR() +
scale_color_manual(values=c(LFDcolor, HFDcolor, "black")) +
theme(panel.border = element_blank(), axis.line = element_line()) +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggsave("figures/forestalpha.pdf", device="pdf", height=4, width=6, useDingbats=F)
Nice.Table(AlphaCombined)
lapply(PerStudy, function(x) x$AlphaDiversity) %>%
do.call(bind_rows, .) %>%
filter(!StudyID %in% c("David 2014", "Wu 2011")) %>%
left_join(metadata) %>%
mutate(Diversity=if_else(Metric=="FBratio", log2(Diversity), Diversity)) %>%
group_by(StudyID, P_Fat, Metric, Diet_Classification) %>%
summarize(mean=mean(Diversity), sd=sd(Diversity)) %>%
ungroup() %>%
mutate(Metric=factor(Metric, levels=c("FBratio","Chao1","Shannon","FaithsPD"))) %>%
ggplot(aes(x=P_Fat, y=mean, ymin=mean-sd, ymax=mean+sd, fill=Diet_Classification)) +
geom_smooth(fill="grey80", color="grey50") +
geom_errorbar(width=0) +
geom_point(shape=21) +
facet_wrap(~Metric, scales="free", nrow=1) +
theme_MicrobeR() +
xlab("% Fat") +
ylab("log2(HFD/LFD)±SD") +
scale_fill_manual(values=c(HFDcolor, LFDcolor)) +
theme(legend.position = "none")
ggsave("figures/alphacor.pdf", device="pdf", height=3, width=8, useDingbats=F)
lapply(PerStudy, function(x) x$AlphaDiversity) %>%
do.call(bind_rows, .) %>%
filter(!StudyID %in% c("David 2014", "Wu 2011")) %>%
left_join(metadata) %>%
mutate(Diversity=if_else(Metric=="FBratio", log2(Diversity), Diversity)) %>%
group_by(StudyID, P_Fat, Metric, Diet_Classification) %>%
summarize(mean=mean(Diversity), sd=sd(Diversity)) %>%
group_by(Metric) %>%
do(
broom::tidy(
cor.test(.$mean,.$P_Fat, method="spearman")
)
) %>%
Nice.Table()
While alpha diversity is not correlated with the percentage of dietary fat, the F/B ratio is.
lapply(names(PerStudy), function(x) PerStudy[[x]]$ADONISs %>% mutate(StudyID=x)) %>%
do.call(bind_rows, .) %>%
filter(!StudyID %in% c("David 2014", "Wu 2011", "Anhe 2015")) %>%
bind_rows(ADONIS %>% mutate(StudyID="Combined")) %>%
filter(Term=="Diet_Classification") %>%
ungroup() %>%
mutate(Metric=gsub("Euclidean","Euclidian", Metric)) %>%
mutate(Metric=factor(Metric, levels=c("Bray Curtis","weighted UniFrac", "unweighted UniFrac","Jensen-Shannon divergence","CLR Euclidian","PhILR Euclidian"))) %>%
left_join(NSamples) %>%
mutate(Sig=if_else(Pvalue<0.05, "*","ns")) %>%
ggplot(aes(x=R2, y=Study, color=Metric)) +
geom_point(shape=1, alpha=0.5) +
geom_point(aes(alpha=Sig), shape=16) +
theme_MicrobeR() +
theme(panel.border = element_blank(), axis.line = element_line()) +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
scale_alpha_manual(values=c(0.5,0))
ggsave("figures/forestbeta.pdf", device="pdf", height=4, width=4, useDingbats=F)
lapply(names(PerStudy), function(x){
lapply(names(PerStudy[[x]]$PCoAs), function(y){
PerStudy[[x]]$PCoAs[[y]]$vectors %>%
as.data.frame() %>%
rownames_to_column("SampleID") %>%
mutate(Metric=y) %>%
select(Metric, SampleID, Axis.1, Axis.2, Axis.3)
}) %>%
do.call(bind_rows, .) %>%
mutate(StudyID=x) %>%
select(StudyID, everything())
}) %>%
do.call(bind_rows, .) %>%
left_join(metadata[,c("SampleID","Diet_Classification")]) %>%
ggplot(aes(x=Axis.1, y=Axis.2, color=Diet_Classification)) +
geom_point(alpha=0.5, shape=16) +
facet_wrap(StudyID~Metric, scales="free", ncol=6) +
theme_MicrobeR() +
theme(axis.ticks = element_blank(), axis.text = element_blank()) +
scale_color_manual(values=c(HFDcolor, LFDcolor)) +
theme(legend.position="none") +
xlab("PC1") + ylab("PC2") +
theme(panel.spacing=unit(0 , "lines"))
ggsave("figures/PerStudyPCoA.pdf", device="pdf", height=40, width=10, useDingbats=F)
save.image(paste0("RDS/Session",format(Sys.time(),"%d%b%Y_%H%M%S"),".Rdata"))
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6880224 367.5 19331441 1032.5 NA 19331441 1032.5
## Vcells 152244108 1161.6 424227929 3236.7 16384 1680471828 12821.0