1 Set up and environment

library(Biostrings)
library(tidyverse)
library(readxl)

library(MicrobeR)
library(philr)

library(ggtree)
library(ape)
library(randomForest)
library(ROCR)

knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning=FALSE, tidy=FALSE, cache=FALSE)
HFDcolor="#E69F00"
LFDcolor="#0072B2"
sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ROCR_1.0-7          gplots_3.0.1        randomForest_4.6-14
##  [4] ape_5.2             ggtree_1.12.0       philr_1.6.0        
##  [7] MicrobeR_0.3.2      readxl_1.1.0        forcats_0.3.0      
## [10] stringr_1.3.1       dplyr_0.7.5         purrr_0.2.5        
## [13] readr_1.1.1         tidyr_0.8.1         tibble_1.4.2       
## [16] ggplot2_3.0.0       tidyverse_1.2.1     Biostrings_2.48.0  
## [19] XVector_0.20.0      IRanges_2.14.10     S4Vectors_0.18.3   
## [22] BiocGenerics_0.26.0
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.3-2    rprojroot_1.3-2     rstudioapi_0.7     
##  [4] DT_0.4              bit64_0.9-7         lubridate_1.7.4    
##  [7] xml2_1.2.0          codetools_0.2-16    splines_3.5.3      
## [10] mnormt_1.5-5        knitr_1.20          ade4_1.7-11        
## [13] jsonlite_1.5        phyloseq_1.24.0     broom_0.4.4        
## [16] cluster_2.0.7-1     compiler_3.5.3      httr_1.3.1         
## [19] rvcheck_0.1.0       backports_1.1.2     assertthat_0.2.0   
## [22] Matrix_1.2-15       lazyeval_0.2.1      cli_1.0.0          
## [25] htmltools_0.3.6     tools_3.5.3         bindrcpp_0.2.2     
## [28] igraph_1.2.1        gtable_0.2.0        glue_1.2.0         
## [31] reshape2_1.4.3      fastmatch_1.1-0     Rcpp_0.12.19       
## [34] Biobase_2.40.0      cellranger_1.1.0    zCompositions_1.1.1
## [37] multtest_2.36.0     gdata_2.18.0        nlme_3.1-137       
## [40] DECIPHER_2.8.1      iterators_1.0.9     psych_1.8.4        
## [43] rvest_0.3.2         phangorn_2.4.0      gtools_3.5.0       
## [46] zlibbioc_1.26.0     MASS_7.3-51.1       scales_0.5.0       
## [49] rtk_0.2.5.7         hms_0.4.2           biomformat_1.10.1  
## [52] rhdf5_2.24.0        yaml_2.2.0          memoise_1.1.0      
## [55] NADA_1.6-1          stringi_1.2.3       RSQLite_2.1.1      
## [58] foreach_1.4.4       tidytree_0.1.9      permute_0.9-4      
## [61] caTools_1.17.1      truncnorm_1.0-8     bitops_1.0-6       
## [64] rlang_0.2.1         pkgconfig_2.0.1     evaluate_0.10.1    
## [67] lattice_0.20-38     Rhdf5lib_1.2.1      bindr_0.1.1        
## [70] treeio_1.4.1        htmlwidgets_1.2     bit_1.1-14         
## [73] tidyselect_0.2.4    plyr_1.8.4          magrittr_1.5       
## [76] R6_2.2.2            picante_1.7         DBI_1.0.0          
## [79] pillar_1.2.3        haven_1.1.1         foreign_0.8-71     
## [82] withr_2.1.2         mgcv_1.8-27         survival_2.43-3    
## [85] modelr_0.1.2        crayon_1.3.4        KernSmooth_2.23-15 
## [88] plotly_4.7.1        rmarkdown_1.10      grid_3.5.3         
## [91] data.table_1.11.4   blob_1.1.1          vegan_2.5-2        
## [94] digest_0.6.15       munsell_0.5.0       viridisLite_0.3.0  
## [97] quadprog_1.5-5
dir.create("figures")
dir.create("RDS")

2 Data Import

Reusing data normalized from HFD_features.html and HFD_features_nol.html

RFdata<-readRDS("../Features/RDS/RFdata.RDS")
  names(RFdata)<-paste0("l_", names(RFdata))
RFdata_nol<-readRDS("../Features_nol/RDS/RFdata.RDS")
  names(RFdata_nol)<-paste0("nol_", names(RFdata_nol))
RFdata<-c(RFdata, RFdata_nol)
rm(RFdata_nol)
  
metadata<-readRDS("../Features/RDS/RFmetadata.RDS") %>% mutate(Task=if_else(grepl("Murine", Task), "Murine", Task))

3 Leave-one-dataset-out

mousestudies<-metadata %>% filter(Colonization %in% c("SPF","MUSD")) %>% group_by(StudyID) %>% summarize(Nsamp=n()) %>% arrange((Nsamp))

Predictions<-tibble()
AUCs<-tibble()

for(VSTUDY in mousestudies$StudyID){
  tmpmeta<-metadata %>% mutate(Task=if_else(StudyID==VSTUDY, "LODO", Task))
  
  for(x in names(RFdata)){
  message(date(), " Running->", VSTUDY," ", x)

    fit<-randomForest(
            x=t(RFdata[[x]][,subset(tmpmeta, Task=="Murine")$SampleID]),
            y=subset(tmpmeta, Task=="Murine")$Diet_Classification,
            importance=TRUE
      )
      for(i in unique(tmpmeta$Task)){
          message(x, " ", i)
          tm<-
          predict(fit, newdata=t(RFdata[[x]][,subset(tmpmeta, Task==i)$SampleID]), type="prob")[,2] %>%
          prediction(., subset(tmpmeta, Task==i)$Diet_Classification) %>%
          performance(., "tpr","fpr")
          
          Predictions<-bind_rows(Predictions, tibble(LODO=VSTUDY, Data=x, Task=i, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values)))
          
          auc<-predict(fit, newdata=t(RFdata[[x]][,subset(tmpmeta, Task==i)$SampleID]), type="prob")[,2] %>%
          prediction(., subset(tmpmeta, Task==i)$Diet_Classification) %>%
          performance(., "auc")
          
          AUCs<-bind_rows(AUCs, tibble(LODO=VSTUDY, Data=x, Task=i, AUC=auc@y.values[[1]]))
        rm(auc, tm)
          
        }
    rm(fit)
  }
rm(tmpmeta)
}

saveRDS(Predictions, "RDS/Predictions.RDS")
saveRDS(AUCs, "RDS/AUCs.RDS")

4 ROCs

To best represet the ROC curves, will calculate the median and 1st and 3rd quartiles rounded to the nearest 0.05 for plotting purposes.

median_iqr<-function(x){data.frame(y=median(x), ymin=as.numeric(summary(x)[2]), ymax=as.numeric(summary(x)[5]))}

Predictions %>%
  filter(Task!="Murine") %>%
  separate(Data, c("Data","Type"), sep="_") %>%
  mutate(Data=if_else(Data=="l","With Lactococcus","Without Lactococcus")) %>%
  mutate(rFPR=round(FPR/0.05)*0.05) %>%
  mutate(Data=factor(Data, levels=c("With Lactococcus", "Without Lactococcus"))) %>%
  mutate(Task=factor(Task, levels=c("LODO","Humanized Mice","Human"))) %>%
  mutate(Type=factor(Type, levels=c("OTU","KO","PhILR"))) %>%
  ggplot(aes(x=rFPR, y=TPR, fill=Task, color=Task, group=Task)) +
  stat_summary(geom="ribbon", fun.data=median_iqr, alpha=0.5, color=NA)  +
  stat_summary(geom="line", fun.data=median_iqr, alpha=0.5, linetype="dashed") +
  geom_abline(linetype="dashed", color="grey50") +
  facet_grid(Data~Type) +
  theme_MicrobeR()

ggsave("figures/LODO_curves.pdf", height=3, width=6, useDingbats=FALSE, device="pdf")

5 AUCs

Nice.Table(AUCs)
AUCs %>%
  filter(Task=="LODO") %>%
  group_by(Data) %>%
  summarize_at("AUC", funs(median,min,max,IQR, sd)) %>%
  separate(Data, c("Lactococcus","Dataset"), sep="_") %>%
  mutate(Lactococcus=if_else(Lactococcus=="nol", "Without","With")) %>%
  knitr::kable()
Lactococcus Dataset median min max IQR sd
With KO 0.9000000 0.4745370 1 0.3735340 0.1903771
With OTU 0.9333333 0.5555556 1 0.2222222 0.1448362
With PhILR 0.9717029 0.4444444 1 0.2150463 0.1569721
Without KO 0.8166667 0.2590278 1 0.3998595 0.2236134
Without OTU 0.9588816 0.4444444 1 0.3033190 0.1710324
Without PhILR 0.9708514 0.5277778 1 0.2175926 0.1596660
save.image(paste0("RDS/Session",format(Sys.time(),"%d%b%Y_%H%M%S"),".Rdata"))
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  4535420 242.3    7736977 413.2         NA  7736977 413.2
## Vcells 87662439 668.9  127780896 974.9      16384 94594228 721.7

Return to main page