This document is otherwise identical to HFD_diversity.Rmd but starts by importing a version of the OTU table that lacks Lactococcus OTUs.

1 Set up and environment

library(tidyverse)
library(readxl)

library(MicrobeR)
library(vegan)
library(phyloseq)
library(philr)

library(doParallel)
library(UpSetR)
library(grid)
library(ggtern)
library(ggtree)
library(ape)
library(picante)
library(randomForest)
library(ROCR)

NSLOTS=6
registerDoParallel(cores=NSLOTS)
knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning=FALSE, tidy=FALSE, cache=FALSE)
HFDcolor="#E69F00"
LFDcolor="#0072B2"
sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ROCR_1.0-7          gplots_3.0.1        randomForest_4.6-14
##  [4] picante_1.7         nlme_3.1-137        ape_5.2            
##  [7] ggtree_1.12.0       ggtern_3.0.0        UpSetR_1.4.0       
## [10] doParallel_1.0.11   iterators_1.0.9     foreach_1.4.4      
## [13] philr_1.6.0         phyloseq_1.24.0     vegan_2.5-2        
## [16] lattice_0.20-38     permute_0.9-4       MicrobeR_0.3.2     
## [19] readxl_1.1.0        forcats_0.3.0       stringr_1.3.1      
## [22] dplyr_0.7.5         purrr_0.2.5         readr_1.1.1        
## [25] tidyr_0.8.1         tibble_1.4.2        ggplot2_3.0.0      
## [28] tidyverse_1.2.1    
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.3-2    rprojroot_1.3-2     XVector_0.20.0     
##   [4] rstudioapi_0.7      DT_0.4              bit64_0.9-7        
##   [7] lubridate_1.7.4     xml2_1.2.0          codetools_0.2-16   
##  [10] splines_3.5.3       mnormt_1.5-5        robustbase_0.93-2  
##  [13] knitr_1.20          ade4_1.7-11         jsonlite_1.5       
##  [16] broom_0.4.4         cluster_2.0.7-1     latex2exp_0.4.0    
##  [19] compiler_3.5.3      httr_1.3.1          rvcheck_0.1.0      
##  [22] backports_1.1.2     assertthat_0.2.0    Matrix_1.2-15      
##  [25] lazyeval_0.2.1      cli_1.0.0           htmltools_0.3.6    
##  [28] tools_3.5.3         bindrcpp_0.2.2      igraph_1.2.1       
##  [31] gtable_0.2.0        glue_1.2.0          reshape2_1.4.3     
##  [34] fastmatch_1.1-0     Rcpp_0.12.19        Biobase_2.40.0     
##  [37] cellranger_1.1.0    Biostrings_2.48.0   zCompositions_1.1.1
##  [40] multtest_2.36.0     gdata_2.18.0        DECIPHER_2.8.1     
##  [43] psych_1.8.4         tensorA_0.36.1      proto_1.0.0        
##  [46] rvest_0.3.2         phangorn_2.4.0      gtools_3.5.0       
##  [49] DEoptimR_1.0-8      zlibbioc_1.26.0     MASS_7.3-51.1      
##  [52] scales_0.5.0        rtk_0.2.5.7         hms_0.4.2          
##  [55] biomformat_1.10.1   rhdf5_2.24.0        yaml_2.2.0         
##  [58] memoise_1.1.0       gridExtra_2.3       NADA_1.6-1         
##  [61] stringi_1.2.3       RSQLite_2.1.1       S4Vectors_0.18.3   
##  [64] energy_1.7-5        tidytree_0.1.9      caTools_1.17.1     
##  [67] BiocGenerics_0.26.0 boot_1.3-20         truncnorm_1.0-8    
##  [70] bitops_1.0-6        compositions_1.40-2 rlang_0.2.1        
##  [73] pkgconfig_2.0.1     evaluate_0.10.1     Rhdf5lib_1.2.1     
##  [76] bindr_0.1.1         treeio_1.4.1        htmlwidgets_1.2    
##  [79] bit_1.1-14          tidyselect_0.2.4    plyr_1.8.4         
##  [82] magrittr_1.5        R6_2.2.2            IRanges_2.14.10    
##  [85] DBI_1.0.0           pillar_1.2.3        haven_1.1.1        
##  [88] foreign_0.8-71      withr_2.1.2         mgcv_1.8-27        
##  [91] survival_2.43-3     bayesm_3.1-0.1      modelr_0.1.2       
##  [94] crayon_1.3.4        KernSmooth_2.23-15  plotly_4.7.1       
##  [97] rmarkdown_1.10      data.table_1.11.4   blob_1.1.1         
## [100] digest_0.6.15       stats4_3.5.3        munsell_0.5.0      
## [103] viridisLite_0.3.0   quadprog_1.5-5

2 Data Import and normalization

Will generate multiple versions of the OTU table in a list which will include a filtered (remove noisy features), 0-replaced, CLR-normalized, subsampled, multiple-subsampled, subsampled to 5000 and samples with less reads removed, and PhILR-normalized versions.

metadata<-read_tsv("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/CollatedData/metadata_filtered.tsv") %>% mutate(SampleID=DB_Sample) %>% as.data.frame() 
rownames(metadata)<-metadata$DB_Sample

taxonomy<-read_tsv("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/dbs/gg_13_8_otus/taxonomy/97_otu_taxonomy.txt", col_names = c("OTU","Taxonomy")) %>%
  separate(Taxonomy, 
           c("Kingdom",
             "Phylum",
             "Class",
             "Order",
             "Family",
             "Genus",
             "Species"
             ), 
           sep="; ", 
           remove=FALSE) %>%
  as.data.frame()
rownames(taxonomy)<-taxonomy$OTU

OTUs<-list()
OTUs$Raw<-read.table("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/CollatedData/OTUs_nol.tsv", header=T, sep='\t', row.names=1)
taxonomy<-subset(taxonomy, OTU %in% rownames(OTUs$Raw))

OTUs$Filtered<-Confidence.Filter(OTUs$Raw, 3, 10, TRUE)
OTUs$CZM<-zCompositions::cmultRepl(t(OTUs$Filtered), 
                                   label=0, 
                                   method="CZM", 
                                   output="counts",
                                   suppress.print=TRUE) %>%
                                   t() #with a non-prior based 0 replacement
OTUs$CLR<-apply(log2(OTUs$CZM), 2, function(x)x-mean(x))
OTUs$Subsampled<-Subsample.Table(OTUs$Filtered, VERBOSE=T)
OTUs$min5kSubsampled<-OTUs$Filtered[,colSums(OTUs$Filtered)>=5000]
dim(OTUs$min5kSubsampled)
## [1] 14589   923
OTUs$min5kSubsampled<-Subsample.Table(OTUs$Filtered, VERBOSE=T, DEPTH=5000)
OTUs$multiSubsampled<-Multiple.Subsample.Table(FEATURES=OTUs$Filtered, VERBOSE=T, THREADS = NSLOTS, NSAMPS = 51)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   64.00   84.00   77.27   92.00  138.00
trees<-list()
trees$Raw<-read_tree_greengenes("../../dbs/gg_13_8_otus/trees/97_otus.tree")
trees$Raw$edge.length[is.nan(trees$Raw$edge.length)]<-0 # workaround for import error on a single edge https://forum.qiime2.org/t/greengenes-tree-download-with-branch-lengths/3329/5 a la qiime1
trees$Raw<-drop.tip(trees$Raw, tip=trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(OTUs$Raw)])
trees$PhILR<-drop.tip(trees$Raw, tip=trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(OTUs$Filtered)])
trees$PhILR<-makeNodeLabel(trees$PhILR, method="number", prefix="n")
trees$Subsampled<-drop.tip(trees$Raw, tip=trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(OTUs$Subsampled)])
trees$min5kSubsampled<-drop.tip(trees$Raw, tip=trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(OTUs$min5kSubsampled)])
trees$multiSubsampled<-drop.tip(trees$Raw, tip=trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(OTUs$multiSubsampled)])

OTUs$PhILR<-t(philr(t(OTUs$CZM), trees$PhILR, part.weights='enorm.x.gm.counts', ilr.weights='blw.sqrt'))

saveRDS(OTUs, "RDS/OTUs.RDS")
saveRDS(trees, "RDS/trees.RDS")
saveRDS(taxonomy, "RDS/taxonomy.RDS")
gc()
##             used  (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells   4502332 240.5    7266798   388.1         NA    7266798   388.1
## Vcells 109680061 836.8 1348380823 10287.4      16384 1680471828 12821.0

3 High level visualizations

3.1 Phylum barplots

OTUs$TaxaSummary<-Summarize.Taxa(OTUs$Filtered, taxonomy)

phab<-
  OTUs$TaxaSummary$Phylum %>%
  Make.Percent() %>%
  as.data.frame() %>%
  rownames_to_column("Phylum") %>%
  as.tibble() %>%
  mutate(Phylum=gsub("..+p__","", Phylum)) %>%
  gather(-Phylum, key="SampleID", value="Abundance")

tp<-
  phab %>% group_by(Phylum) %>% summarize(mean=mean(Abundance)) %>%
  arrange(desc(mean)) %>%
  top_n(9, mean) %>%
  bind_rows(., tibble(Phylum="Other", mean=0))

por<-
  phab %>%
  filter(Phylum=="Firmicutes") %>%
  arrange(desc(Abundance)) %>%
  pull(SampleID)

phab %>%
  mutate(Phylum=if_else(Phylum %in% tp$Phylum, Phylum, "Other")) %>%
  mutate(Phylum=factor(Phylum, levels = rev(tp$Phylum))) %>%
  left_join(metadata) %>%
  #filter(Colonization=="SPF" | Colonization=="MUSD") %>%
  mutate(SampleID=factor(SampleID, levels=por)) %>%
  ggplot(aes(x=SampleID, y=Abundance, fill=Phylum)) +
  geom_bar(stat="identity") +
  facet_grid(~Diet_Classification, scales="free_x", space = "free") +
  theme_MicrobeR() +
  theme(axis.text.x = element_blank()) +
  theme(axis.ticks.x = element_blank()) +
  coord_cartesian(expand=F) +
  xlab("Sample") +
  ylab("Phylum Abundance (%)") +
  scale_fill_manual(values=rev(c(
    "blue4",
    "olivedrab",
    "firebrick",
    "gold",
    "darkorchid",
    "steelblue2",
    "chartreuse1",
    "aquamarine",
    "coral",
    "grey"
  )))

ggsave("figures/Phylumplot.pdf", device="pdf", height=3, width=6)
rm(phab, tp, por)

3.2 Overlap of OTUs between studies

uplot<-
  OTUs$Raw %>%
  as.data.frame() %>%
  rownames_to_column("OTU") %>%
  as_tibble() %>%
  gather(-OTU, key=SampleID, value=Count) %>%
  mutate(Count=if_else(Count==0, 0, 1)) %>%
  left_join(metadata[,c("SampleID","StudyID")]) %>%
  select(-SampleID) %>%
  group_by(StudyID, OTU) %>%
  summarize(Count=max(Count))
  
uplot<-
  uplot %>%
  spread(key=StudyID, value=Count, fill=0) %>%
  as.data.frame() %>%
  upset(., nsets=30, nintersects=300, order.by="freq", show.numbers=F, matrix.dot.alpha=0)
  
uplot

pdf("figures/upset.pdf", height=11, width=20, useDingbats=F)
  print(uplot)
dev.off()
## quartz_off_screen 
##                 2
rm(uplot)
gc()
##             used  (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   4735475 253.0    7266798  388.1         NA    7266798   388.1
## Vcells 130610151 996.5 1078704658 8229.9      16384 1680471828 12821.0

3.3 Ordinations

3.3.1 Distance Matrix Generation

Then will generate the CLR-euclidian, PhILR-euclidian, weighted and unweighted unifrac, Bray Curtis, and JSD metrics for each of the versions of the OTU tables created above.

DistanceMatrices<-list()
DistanceMatrices$Subsampled<-list()
DistanceMatrices$min5kSubsampled<-list()
DistanceMatrices$multiSubsampled<-list()

DistanceMatrices$Subsampled[["PhILR Euclidian"]]<-dist(t(OTUs$PhILR), method="euclidian"); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   4724445  252.4    7266798  388.1         NA    7266798   388.1
## Vcells 131121183 1000.4  862963726 6583.9      16384 1680471828 12821.0
DistanceMatrices$Subsampled[["CLR Euclidian"]]<-dist(t(OTUs$CLR), method="euclidian"); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   4724460  252.4    7266798  388.1         NA    7266798   388.1
## Vcells 131726752 1005.0  690370980 5267.2      16384 1680471828 12821.0
DistanceMatrices$Subsampled[["Bray Curtis"]]<-vegdist(t(Make.Proportion(OTUs$Subsampled)), method="bray"); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   4724726  252.4    7266798  388.1         NA    7266798   388.1
## Vcells 132333861 1009.7  552296784 4213.7      16384 1680471828 12821.0
DistanceMatrices$Subsampled[["Jensen-Shannon Divergence"]]<-phyloseq::distance(phyloseq(otu_table(Make.Proportion(OTUs$Subsampled), taxa_are_rows = T)), method="jsd", parallel=T); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6548797  349.8   13956171  745.4         NA   13497839   720.9
## Vcells 134429255 1025.7  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$Subsampled[["weighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(OTUs$Subsampled), taxa_are_rows = T), phy=trees$Subsampled), weighted=T, parallel=T); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6550814  349.9   13956171  745.4         NA   13956171   745.4
## Vcells 135040008 1030.3  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$Subsampled[["unweighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(OTUs$Subsampled), taxa_are_rows = T), phy=trees$Subsampled), weighted=F, parallel=T); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6550833  349.9   13956171  745.4         NA   13956171   745.4
## Vcells 135646694 1035.0  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$min5kSubsampled[["PhILR Euclidian"]]<-dist(t(OTUs$PhILR[,colnames(OTUs$min5kSubsampled)]), method="euclidian"); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6550882  349.9   13956171  745.4         NA   13956171   745.4
## Vcells 136073168 1038.2  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$min5kSubsampled[["CLR Euclidian"]]<-dist(t(OTUs$CLR[,colnames(OTUs$min5kSubsampled)]), method="euclidian"); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6550919  349.9   13956171  745.4         NA   13956171   745.4
## Vcells 136499633 1041.5  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$min5kSubsampled[["Bray Curtis"]]<-vegdist(t(Make.Proportion(OTUs$min5kSubsampled)), method="bray"); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6550949  349.9   13956171  745.4         NA   13956171   745.4
## Vcells 136926100 1044.7  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$min5kSubsampled[["Jensen-Shannon Divergence"]]<-phyloseq::distance(phyloseq(otu_table(Make.Proportion(OTUs$min5kSubsampled), taxa_are_rows = T)), method="jsd", parallel=T); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6550965  349.9   13956171  745.4         NA   13956171   745.4
## Vcells 137352556 1048.0  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$min5kSubsampled[["weighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(OTUs$min5kSubsampled), taxa_are_rows = T), phy=trees$min5kSubsampled), weighted=T, parallel=T); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6550984  349.9   13956171  745.4         NA   13956171   745.4
## Vcells 137779025 1051.2  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$min5kSubsampled[["unweighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(OTUs$min5kSubsampled), taxa_are_rows = T), phy=trees$min5kSubsampled), weighted=F, parallel=T); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6551003  349.9   13956171  745.4         NA   13956171   745.4
## Vcells 138205486 1054.5  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$multiSubsampled[["PhILR Euclidian"]]<-dist(t(OTUs$PhILR[,colnames(OTUs$multiSubsampled)]), method="euclidian"); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6551051  349.9   13956171  745.4         NA   13956171   745.4
## Vcells 138812185 1059.1  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$multiSubsampled[["CLR Euclidian"]]<-dist(t(OTUs$CLR[,colnames(OTUs$multiSubsampled)]), method="euclidian"); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6551088  349.9   13956171  745.4         NA   13956171   745.4
## Vcells 139418875 1063.7  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$multiSubsampled[["Bray Curtis"]]<-vegdist(t(Make.Proportion(OTUs$multiSubsampled)), method="bray"); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6551117  349.9   13956171  745.4         NA   13956171   745.4
## Vcells 140024466 1068.4  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$multiSubsampled[["Jensen-Shannon Divergence"]]<-phyloseq::distance(phyloseq(otu_table(Make.Proportion(OTUs$multiSubsampled), taxa_are_rows = T)), method="jsd", parallel=T); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6551133  349.9   13956172  745.4         NA   13956172   745.4
## Vcells 141155435 1077.0  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$multiSubsampled[["weighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(OTUs$multiSubsampled), taxa_are_rows = T), phy=trees$multiSubsampled), weighted=T, parallel=T); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6551152  349.9   19331441 1032.5         NA   13956172   745.4
## Vcells 141762129 1081.6  441837427 3371.0      16384 1680471828 12821.0
DistanceMatrices$multiSubsampled[["unweighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(OTUs$multiSubsampled), taxa_are_rows = T), phy=trees$multiSubsampled), weighted=F, parallel=T); gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6551171  349.9   19331441 1032.5         NA   19331441  1032.5
## Vcells 142368815 1086.2  441837427 3371.0      16384 1680471828 12821.0
saveRDS(DistanceMatrices, "RDS/DistanceMatrices.RDS")

3.3.2 Distance distribution

alldists<-
lapply(names(DistanceMatrices$multiSubsampled), function(dname){
  DistanceMatrices$multiSubsampled[[dname]] %>%
  as.matrix() %>% 
  as.data.frame() %>%
  rownames_to_column("Subject") %>%
  gather(-Subject, key="Match", value=Distance) %>%
  mutate(Metric=dname)
  }) %>%
  do.call(bind_rows, .)

alldists %>%
  mutate(Metric=factor(Metric, levels=c("PhILR Euclidian", "unweighted UniFrac","weighted UniFrac", "CLR Euclidian","Bray Curtis","Jensen-Shannon Divergence"))) %>%
  ggplot(aes(x=Distance, color=Metric)) +
  geom_freqpoly() +
  theme_MicrobeR() +
  theme(legend.position="none") +
  facet_wrap(~Metric, scales="free")

ggsave("figures/disthist.pdf", device="pdf", width=7, height=5, useDingbats=F)

rm(alldists)
lapply(names(DistanceMatrices$multiSubsampled), function(x){
  DistanceMatrices$multiSubsampled[[x]] %>%
    as.vector() %>%
    summary() %>%
    broom::tidy() %>%
    mutate(Metric=x) %>%
    select(Metric, everything())
}) %>%
  do.call(bind_rows, .) %>%
  Nice.Table()

Because the non-phylogeneticly weighted metrics are so highly saturated in completely non-overlapping samples, will remove them from analysis of the all-sample analysis.

for (x in names(DistanceMatrices)){
  DistanceMatrices[[x]]$`CLR Euclidian`<-NULL
  DistanceMatrices[[x]]$`Bray Curtis`<-NULL
  DistanceMatrices[[x]]$`Jensen-Shannon Divergence`<-NULL
  }

3.3.3 Ordinations

Here only ordinating mouse samples that are from SPF mice, or gnotobiotic mice colonized with mouse communities.

mousesamps<-metadata %>% filter(Colonization %in% c("SPF")) %>% pull(SampleID)
PCoAs<-list()
for (y in names(DistanceMatrices)){
  for(x in names(DistanceMatrices[[y]])){
    tm<-mousesamps[mousesamps %in% colnames(as.matrix(DistanceMatrices[[y]][[x]]))]
      message(x,"-", y)
    PCoAs[[paste0(x,"-",y)]]<-
      as.matrix(DistanceMatrices[[y]][[x]])[tm,tm] %>%
      as.dist(.) %>%
      ape::pcoa()
  }
}
rm(tm, y, x)

3.3.3.1 Variation Explained

lapply(names(PCoAs), function(x){
  PCoAs[[x]]$values %>%
    as.data.frame() %>%
    rownames_to_column("Axis") %>%
    as_tibble() %>%
    mutate(Metric=x) %>%
    mutate(Axis=as.numeric(Axis)) %>%
    filter(Axis<=10)
}) %>%
  do.call(bind_rows, .) %>%
  mutate(Pvar=100*Relative_eig) %>%
  separate(Metric, c("Metric","Dataset"), sep="-") %>%
  ggplot(aes(x=Axis, y=Pvar, color=Metric)) +
  geom_line() +
  theme_MicrobeR() +
  ylab("% variation explained") +
  xlab("PC") +
  scale_x_continuous(breaks=1:10) +
  facet_wrap(~Dataset)

ggsave("figures/scree.pdf", device="pdf", width=9, height=3, useDingbats=F)
lapply(names(PCoAs), function(x){
  PCoAs[[x]]$values %>%
    as.data.frame() %>%
    rownames_to_column("Axis") %>%
    as_tibble() %>%
    mutate(Metric=x) %>%
    mutate(Axis=as.numeric(Axis)) %>%
    filter(Axis<=10)
}) %>%
  do.call(bind_rows, .) %>%
  select(Metric, everything()) %>%
  separate(Metric, c("Metric","Dataset"), sep="-") %>%
  Nice.Table(.)

3.3.3.2 Plot by Diet type

plotorder<-c("unweighted UniFrac","weighted UniFrac","PhILR Euclidian")

lapply(names(PCoAs), function(x) PCoAs[[x]]$vectors %>% as.data.frame() %>% rownames_to_column("SampleID") %>% select(SampleID, Axis.1, Axis.2, Axis.3) %>% mutate(Metric=x)) %>%
  do.call(bind_rows, .) %>%
  as_tibble() %>%
  left_join(metadata) %>%
  separate(Metric, c("Metric","Dataset"), sep="-") %>%
  mutate(Metric=factor(Metric, levels=plotorder)) %>%
  ggplot(aes(x=Axis.1, y=Axis.2)) +
  geom_point(aes(color=Diet_Classification), shape=16, alpha=0.3) +
  scale_color_manual(values=c(HFDcolor, LFDcolor)) +
  theme_MicrobeR() +
  facet_wrap(Dataset~Metric, scales="free", ncol=3) +
  theme(panel.border = element_blank(), axis.line = element_line()) +
  theme(axis.text = element_blank(), axis.ticks = element_blank()) +
  xlab("") +
  ylab("") +
  theme(legend.position = "none")

ggsave("figures/PCoAs_diet.pdf", device="pdf", height=7.5, width=6, useDingbats=F)
lapply(names(PCoAs), function(x) PCoAs[[x]]$vectors %>% as.data.frame() %>% rownames_to_column("SampleID") %>% select(SampleID, Axis.1, Axis.2, Axis.3) %>% mutate(Metric=x)) %>%
  do.call(bind_rows, .) %>%
  as_tibble() %>%
  left_join(metadata) %>%
  separate(Metric, c("Metric","Dataset"), sep="-") %>%
  mutate(Metric=factor(Metric, levels=plotorder)) %>%
  ggplot(aes(x=Axis.2, y=Axis.3)) +
  geom_point(aes(color=Diet_Classification), shape=16, alpha=0.3) +
  scale_color_manual(values=c(HFDcolor, LFDcolor)) +
  theme_MicrobeR() +
  facet_wrap(Dataset~Metric, scales="free", ncol=3) +
  theme(panel.border = element_blank(), axis.line = element_line()) +
  theme(axis.text = element_blank(), axis.ticks = element_blank()) +
  xlab("") +
  ylab("") +
  theme(legend.position = "none")

ggsave("figures/PCoAs_diet_2v3.pdf", device="pdf", height=7.5, width=6, useDingbats=F)

3.3.3.3 Plot by Study

ps<-
  lapply(names(PCoAs), function(x) PCoAs[[x]]$vectors %>% as.data.frame() %>% rownames_to_column("SampleID") %>% select(SampleID, Axis.1, Axis.2, Axis.3) %>% mutate(Metric=x)) %>%
  do.call(bind_rows, .) %>%
  as_tibble() %>%
  left_join(metadata) %>%
    separate(Metric, c("Metric","Dataset"), sep="-") %>%
  mutate(Metric=factor(Metric, levels=plotorder)) %>%
  ggplot(aes(x=Axis.1, y=Axis.2)) +
  geom_point(aes(color=StudyID), shape=16, alpha=0.4) +
  theme_MicrobeR() +
  facet_wrap(Dataset~Metric, scales="free", ncol=3) +
  theme(panel.border = element_blank(), axis.line = element_line()) +
  theme(axis.text = element_blank(), axis.ticks = element_blank()) +
  xlab("") +
  ylab("") +
  theme(legend.position = "none")
ps

ggsave("figures/PCoAs_study.pdf", device="pdf", height=7.5, width=6, useDingbats=F)
ps<-
  lapply(names(PCoAs), function(x) PCoAs[[x]]$vectors %>% as.data.frame() %>% rownames_to_column("SampleID") %>% select(SampleID, Axis.1, Axis.2, Axis.3) %>% mutate(Metric=x)) %>%
  do.call(bind_rows, .) %>%
  as_tibble() %>%
  left_join(metadata) %>%
  separate(Metric, c("Metric","Dataset"), sep="-") %>%
  mutate(Metric=factor(Metric, levels=plotorder)) %>%
  ggplot(aes(x=Axis.2, y=Axis.3)) +
  geom_point(aes(color=StudyID), shape=16, alpha=0.4) +
  theme_MicrobeR() +
  facet_wrap(Dataset~Metric, scales="free", ncol=3) +
  theme(panel.border = element_blank(), axis.line = element_line()) +
  theme(axis.text = element_blank(), axis.ticks = element_blank()) +
  xlab("PC2") +
  ylab("PC3") +
  theme(legend.position = "none")
ps

ggsave("figures/PCoAs_study_2v3.pdf", device="pdf", height=7.5, width=6, useDingbats=F)
ps<-ps + theme(legend.position="right")
ps<-cowplot::get_legend(ps)
plot(ps)

ggsave("figures/PCoAs_study_legend.pdf", ps, device="pdf", height=5, width=3, useDingbats=F)

3.3.4 ADONIS

  ADONIS<-tibble()
  for(x in names(DistanceMatrices)){
    for(y in names(DistanceMatrices[[x]])){
    message(x,"-",y)
    ts<-mousesamps[mousesamps %in% labels(DistanceMatrices[[x]][[y]])]
    tm<-metadata %>% as.data.frame() %>% remove_rownames() %>% filter(SampleID %in% ts) %>% column_to_rownames("SampleID") 
    td<-as.matrix(DistanceMatrices[[x]][[y]])[ts,ts] %>% as.dist()
    tm<-tm[labels(td),]
    ta<-adonis(td~Diet_Classification+StudyID, data=tm, strata=tm$StudyID, permutations=999, parallel=NSLOTS)
      message("ADONIS complete for ",x)
    ta<-ta$aov.tab %>% 
      as.data.frame() %>% 
      rownames_to_column("Term") %>%
      mutate(Metric=paste0(x,"-",y)) %>% 
      select(Metric, everything()) %>%
      rename(Pvalue=`Pr(>F)`)
    ADONIS<-bind_rows(ADONIS, ta)
    rm(td, tm, ta, ts)
    gc()
    }
  }
  saveRDS(ADONIS, "RDS/ADONIS.RDS")
ADONIS<-ADONIS %>% separate(Metric, c("Data","Metric"), sep="-")
write_tsv(ADONIS, "figures/ADONIS_wstrata.txt")
Nice.Table(ADONIS)

3.4 Conclusions from high level visualizations

Based on the data generated above, it appears clear that there is some effect of dietary fat on microbiota composition. This can be observed in the PCoA visualizations and the apparent differences in the Firmicutes content seen in the boxplots. These comparison inevitably required a high level of subsampling, and as such further analysis will focus on within study normalization.

rm(DistanceMatrices, PCoAs, ps, plotorder, x, mousesamps)
gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6645867  355.0   19331441 1032.5         NA   19331441  1032.5
## Vcells 132040063 1007.4  441837427 3371.0      16384 1680471828 12821.0

4 Per Study Analysis

4.1 processStudy function

To conduct this analysis, will use one function to analyze each study using the same approach but with study-appropriate normalization

processStudy<-function(Study){
# PS is the per study object which is a named list with the following objects:
  # Study
  # Metadata
  # AlphaDiversity
  # AlphaDiversity_Stats
  # DistanceMatrices
  # PCoAs
  # ADONISs
  # FBratio
  # OTUs_raw
  # OTUs_subsampled
  # OTUs_clr
  # OTUs_PhILR
  # OTUs_filtered
  # Tree_GG
  # Tree_PhILR
  # Subsample_depth
 PS<-list()
  PS$Study<-Study
  PS$Metadata<-subset(metadata, StudyID==Study)
  PS$OTUs_raw<-OTUs$Raw[,PS$Metadata$SampleID]
  PS$OTUs_raw<-PS$OTUs_raw[rowSums(PS$OTUs_raw)>=1,]
  message(Study, " table has dimensions:", paste(dim(PS$OTUs_raw), collapse="x"))

  
  PS$Subsample_depth<-min(colSums(PS$OTUs_raw))
  PS$OTUs_subsampled<-Subsample.Table(PS$OTUs_raw, DEPTH = PS$Subsample_depth, VERBOSE = T)
  PS$OTUs_filtered<-Confidence.Filter(PS$OTUs_raw, 3, 10)
  
  PS$Tree_GG<-drop.tip(trees$Raw, trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(PS$OTUs_subsampled)])

  message("FBratioing")
  PS$FBratio<-
    Summarize.Taxa(PS$OTUs_raw, taxonomy %>% as.data.frame() %>% remove_rownames() %>% column_to_rownames("OTU"))$Phylum %>%
    Make.Percent() %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column("SampleID") %>%
    mutate(FBratio=(`k__Bacteria;p__Firmicutes`+0.1)/(`k__Bacteria;p__Bacteroidetes`+0.1)) %>%
    select(SampleID, FBratio)

  message("Alpha diversity")

  PS$AlphaDiversity<-data.frame(
    Shannon=vegan::diversity(PS$OTUs_subsampled, index="shannon", MARGIN=2),
    Chao1=vegan::estimateR(t(PS$OTUs_subsampled)) %>% t() %>% as.data.frame %>% rename(Chao1=S.chao1) %>% pull(Chao1),
    FaithsPD=picante::pd(t(PS$OTUs_subsampled), PS$Tree_GG, include.root=F)$PD
  )  %>% rownames_to_column("SampleID") %>% 
    select(SampleID, everything()) %>% 
    left_join(PS$FBratio) %>%
    as_tibble() %>%
    gather(-SampleID, key="Metric", value="Diversity") %>%
    left_join(PS$Metadata[,c("SampleID","Diet_Classification", "StudyID")]) %>%
    select(SampleID, StudyID, Diet_Classification, Metric, Diversity)
  
  PS$AlphaDiversity <-
    PS$AlphaDiversity %>%
    left_join(
      PS$AlphaDiversity %>%
      group_by(Metric, Diet_Classification) %>%
      summarize(mean=mean(log2(Diversity))) %>%
      spread(key=Diet_Classification, value=mean) %>%
      rename(mean_log2HFD=HFD, mean_log2LFD=LFD)
    ) %>%
    mutate(log2FC=log2(Diversity)-mean_log2LFD)

  PS$AlphaDiversity_Stats <-
    PS$AlphaDiversity %>%
    group_by(Metric) %>% 
    do(
      broom::tidy(t.test(log2FC~Diet_Classification, data=., conf.int=TRUE, conf.level=0.95))
    ) %>%
    mutate(StudyID=Study) %>%
    select(StudyID, Metric, log2FC=estimate, Pvalue=p.value, mean_LFD=estimate2, mean_HFD=estimate1, CI_low=conf.low, CI_high=conf.high)

  PS$OTUs_clr<-Make.CLR(PS$OTUs_filtered, CZM=TRUE)
  
  PS$DistanceMatrices<-list()
  message("PhILR Euclidian")
  
  PS$Tree_PhILR<-drop.tip(trees$Raw, trees$Raw$tip.label[!trees$Raw$tip.label %in% rownames(PS$OTUs_filtered)])
  PS$Tree_PhILR<-makeNodeLabel(PS$Tree_PhILR, method="number", prefix="n")
  PS$OTUs_PhILR<-t(philr(t(PS$OTUs_filtered+0.5), PS$Tree_PhILR, part.weights='enorm.x.gm.counts', ilr.weights='blw.sqrt'))

  PS$DistanceMatrices[["PhILR Euclidian"]]<-dist(t(PS$OTUs_PhILR), method="euclidian")
    message("CLR Euclidian")
  PS$DistanceMatrices[["CLR Euclidian"]]<-dist(t(PS$OTUs_clr), method="euclidian")
    message("Bray Curtis")
  PS$DistanceMatrices[["Bray Curtis"]]<-vegdist(t(Make.Proportion(PS$OTUs_subsampled)), method="bray")
    message("Wunifrac")
  PS$DistanceMatrices[["weighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(PS$OTUs_subsampled), taxa_are_rows = T), phy=PS$Tree_GG), weighted=T, parallel=T)
    message("UWunifrac")
  PS$DistanceMatrices[["unweighted UniFrac"]]<-UniFrac(phyloseq(otu_table(Make.Proportion(PS$OTUs_subsampled), taxa_are_rows = T), phy=PS$Tree_GG), weighted=F, parallel=T)
    message("JSD")
  PS$DistanceMatrices[["Jensen-Shannon divergence"]]<-phyloseq::distance(phyloseq(otu_table(Make.Proportion(PS$OTUs_subsampled), taxa_are_rows = T)), method="jsd", parallel=T)
  
  message("PCoAing")
  PS$PCoAs<-lapply(PS$DistanceMatrices, pcoa)
  
  message("ADONISing")

  PS$ADONISs<-lapply(PS$DistanceMatrices, function(x) adonis(x~PS$Metadata[match(labels(x), PS$Metadata$SampleID),]$Diet_Classification))

  PS$ADONISs<-
    lapply(names(PS$ADONISs), function(x){
    PS$ADONISs[[x]]$aov.tab %>% 
      as.data.frame() %>%
      rownames_to_column("Term") %>%
      as.tibble() %>%
      mutate(Metric=x) %>%
      select(Metric, Term, R2, Pvalue=`Pr(>F)`) %>%
      mutate(Term=if_else(Term=="PS$Metadata[match(labels(x), PS$Metadata$SampleID), ]$Diet_Classification", "Diet_Classification", Term))
      }) %>%
    do.call(bind_rows,.)
    gc()
  return(PS)
}

4.2 Process Studies

  PerStudy<-list()
  for(Study in (unique(metadata$StudyID))){
    message("--------------------->", Study)
    PerStudy[[Study]]<-processStudy(Study)
  }
  saveRDS(PerStudy, "RDS/PerStudy.RDS")

4.3 Forest Plot

In the forest plot I will also include all studies pooled. Due to differences in baseline, I will use the log2 fold changes to pool studies and calculate the summed effect.

4.3.1 Alpha div metrics

mod<-lapply(PerStudy, function(x) x$AlphaDiversity) %>%
  do.call(bind_rows, .) %>%
  mutate(Diet_Classification=factor(Diet_Classification, levels=c("LFD","HFD"))) %>%
  filter(!StudyID %in% c("David 2014", "Wu 2011")) %>% #remove human samples
  group_by(Metric)

AlphaCombined<-tibble(StudyID=character(0), Metric=character(0), log2FC=numeric(0), Pvalue=numeric(0), mean_LFD=numeric(0), mean_HFD=numeric(0), CI_low=numeric(0), CI_high=numeric(0))
for(i in unique(mod$Metric)){
  fit<-lmerTest:::lmer(log2FC~Diet_Classification+(1|StudyID), data=subset(mod, Metric==i))
  cf<-confint(fit,level = 0.95)
  
  AlphaCombined<-bind_rows(AlphaCombined, tibble(
    StudyID="Combined", 
    Metric=i, 
    log2FC=summary(fit)$coefficients["Diet_ClassificationHFD", "Estimate"], 
    Pvalue=anova(fit)$`Pr(>F)`, 
    mean_LFD=NA, 
    mean_HFD=NA, 
    CI_low=cf["Diet_ClassificationHFD",1], 
    CI_high=cf["Diet_ClassificationHFD",2]
  ))
}

NSamples<-
  lapply(names(PerStudy), function(x) tibble(StudyID=x, Nsamples=length(unique(PerStudy[[x]]$AlphaDiversity$SampleID)))) %>% 
  do.call(bind_rows, .) %>% 
  arrange(desc(Nsamples)) %>% 
  mutate(Study=paste0(StudyID, " (n=", Nsamples,")")) %>% 
  bind_rows(tibble(StudyID="Combined", Study="Combined")) %>%
  mutate(Study=factor(Study, levels=rev(Study)))


lapply(PerStudy, function(x) x$AlphaDiversity_Stats) %>%
  do.call(bind_rows, .) %>%
    filter(!StudyID %in% c("David 2014", "Wu 2011", "Anhe 2015")) %>% #Anhe 2015 removed as no replicates so any comparison is not really appropriate
  bind_rows(AlphaCombined) %>%
  mutate(Significant=case_when(
    Pvalue<0.05 & log2FC>0 ~ "* up",
    Pvalue<0.05 & log2FC<0 ~ "* down",
    TRUE~"ns"
  )) %>%
  ungroup() %>%
  mutate(Metric=factor(Metric, levels=c("Chao1","Shannon", "FaithsPD","FBratio"))) %>%
  left_join(NSamples) %>%
  ggplot(aes(x=log2FC, y=Study, color=Significant)) +
  geom_vline(xintercept = 0, linetype="dashed", color="grey50") +
  geom_errorbarh(aes(xmin=CI_low, xmax=CI_high), height=0 ) +
  geom_point() +
  facet_grid(~Metric, scales="free_x") +
  theme_MicrobeR() +
  scale_color_manual(values=c(LFDcolor, HFDcolor, "black"))  +
  theme(panel.border = element_blank(), axis.line = element_line()) +
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggsave("figures/forestalpha.pdf", device="pdf", height=4, width=6, useDingbats=F)

4.3.1.1 Combined Alpha Diversity Stats

Nice.Table(AlphaCombined)

4.3.2 Check for a correlation between Alpha diversity and fat content

lapply(PerStudy, function(x) x$AlphaDiversity) %>%
  do.call(bind_rows, .) %>%
  filter(!StudyID %in% c("David 2014", "Wu 2011")) %>%
  left_join(metadata) %>%
  mutate(Diversity=if_else(Metric=="FBratio", log2(Diversity), Diversity)) %>%
  group_by(StudyID, P_Fat, Metric, Diet_Classification) %>%
  summarize(mean=mean(Diversity), sd=sd(Diversity)) %>%
  ungroup() %>%
  mutate(Metric=factor(Metric, levels=c("FBratio","Chao1","Shannon","FaithsPD"))) %>%
  ggplot(aes(x=P_Fat, y=mean, ymin=mean-sd, ymax=mean+sd, fill=Diet_Classification)) +
  geom_smooth(fill="grey80", color="grey50") +
  geom_errorbar(width=0) +
  geom_point(shape=21) +
  facet_wrap(~Metric, scales="free", nrow=1) +
  theme_MicrobeR() +
  xlab("% Fat") +
  ylab("log2(HFD/LFD)±SD") +
  scale_fill_manual(values=c(HFDcolor, LFDcolor)) +
  theme(legend.position = "none")

ggsave("figures/alphacor.pdf", device="pdf", height=3, width=8, useDingbats=F)
lapply(PerStudy, function(x) x$AlphaDiversity) %>%
  do.call(bind_rows, .) %>%
  filter(!StudyID %in% c("David 2014", "Wu 2011")) %>%
  left_join(metadata) %>%
  mutate(Diversity=if_else(Metric=="FBratio", log2(Diversity), Diversity)) %>%
  group_by(StudyID, P_Fat, Metric, Diet_Classification) %>%
  summarize(mean=mean(Diversity), sd=sd(Diversity)) %>%
  group_by(Metric) %>%
  do(
    broom::tidy(
      cor.test(.$mean,.$P_Fat, method="spearman")
    )
  ) %>% 
  Nice.Table()

While alpha diversity is not correlated with the percentage of dietary fat, the F/B ratio is.


4.3.3 Bdiv metrics

lapply(names(PerStudy), function(x) PerStudy[[x]]$ADONISs %>% mutate(StudyID=x)) %>%
  do.call(bind_rows, .) %>%
    filter(!StudyID %in% c("David 2014", "Wu 2011", "Anhe 2015")) %>%
  bind_rows(ADONIS %>% mutate(StudyID="Combined")) %>%
  filter(Term=="Diet_Classification") %>%
  ungroup() %>%
  mutate(Metric=gsub("Euclidean","Euclidian", Metric)) %>%
  mutate(Metric=factor(Metric, levels=c("Bray Curtis","weighted UniFrac", "unweighted UniFrac","Jensen-Shannon divergence","CLR Euclidian","PhILR Euclidian"))) %>%
  left_join(NSamples) %>%
  mutate(Sig=if_else(Pvalue<0.05, "*","ns")) %>%
  ggplot(aes(x=R2, y=Study, color=Metric)) +
  geom_point(shape=1, alpha=0.5) +
  geom_point(aes(alpha=Sig), shape=16) +
  theme_MicrobeR() +
  theme(panel.border = element_blank(), axis.line = element_line()) +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  scale_alpha_manual(values=c(0.5,0))

ggsave("figures/forestbeta.pdf", device="pdf", height=4, width=4, useDingbats=F)

4.3.4 Ordination separately by study

lapply(names(PerStudy), function(x){
  lapply(names(PerStudy[[x]]$PCoAs), function(y){
    PerStudy[[x]]$PCoAs[[y]]$vectors %>% 
      as.data.frame() %>% 
      rownames_to_column("SampleID") %>% 
      mutate(Metric=y) %>%
      select(Metric, SampleID, Axis.1, Axis.2, Axis.3)
  }) %>%
    do.call(bind_rows, .) %>%
    mutate(StudyID=x) %>%
    select(StudyID, everything())
}) %>%
  do.call(bind_rows, .) %>%
  left_join(metadata[,c("SampleID","Diet_Classification")]) %>%
  ggplot(aes(x=Axis.1, y=Axis.2, color=Diet_Classification)) +
  geom_point(alpha=0.5, shape=16) +
  facet_wrap(StudyID~Metric, scales="free", ncol=6) +
  theme_MicrobeR() +
  theme(axis.ticks = element_blank(), axis.text = element_blank()) +
  scale_color_manual(values=c(HFDcolor, LFDcolor)) +
  theme(legend.position="none") +
  xlab("PC1") + ylab("PC2") +
  theme(panel.spacing=unit(0 , "lines"))

ggsave("figures/PerStudyPCoA.pdf", device="pdf", height=40, width=10, useDingbats=F)
save.image(paste0("RDS/Session",format(Sys.time(),"%d%b%Y_%H%M%S"),".Rdata"))
gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   6880224  367.5   19331441 1032.5         NA   19331441  1032.5
## Vcells 152244108 1161.6  424227929 3236.7      16384 1680471828 12821.0

Return to main page