1 Set up and environment

library(Biostrings)
library(tidyverse)
library(readxl)

library(MicrobeR)
library(philr)

library(ggtree)
library(ape)
library(randomForest)
library(ROCR)

knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning=FALSE, tidy=FALSE, cache=FALSE)
HFDcolor="#E69F00"
LFDcolor="#0072B2"
sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ROCR_1.0-7          gplots_3.0.1        randomForest_4.6-14
##  [4] ape_5.2             ggtree_1.12.0       philr_1.6.0        
##  [7] MicrobeR_0.3.2      readxl_1.1.0        forcats_0.3.0      
## [10] stringr_1.3.1       dplyr_0.7.5         purrr_0.2.5        
## [13] readr_1.1.1         tidyr_0.8.1         tibble_1.4.2       
## [16] ggplot2_3.0.0       tidyverse_1.2.1     Biostrings_2.48.0  
## [19] XVector_0.20.0      IRanges_2.14.10     S4Vectors_0.18.3   
## [22] BiocGenerics_0.26.0
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.3-2    rprojroot_1.3-2     rstudioapi_0.7     
##  [4] DT_0.4              bit64_0.9-7         lubridate_1.7.4    
##  [7] xml2_1.2.0          codetools_0.2-16    splines_3.5.3      
## [10] mnormt_1.5-5        knitr_1.20          ade4_1.7-11        
## [13] jsonlite_1.5        phyloseq_1.24.0     broom_0.4.4        
## [16] cluster_2.0.7-1     compiler_3.5.3      httr_1.3.1         
## [19] rvcheck_0.1.0       backports_1.1.2     assertthat_0.2.0   
## [22] Matrix_1.2-15       lazyeval_0.2.1      cli_1.0.0          
## [25] htmltools_0.3.6     tools_3.5.3         bindrcpp_0.2.2     
## [28] igraph_1.2.1        gtable_0.2.0        glue_1.2.0         
## [31] reshape2_1.4.3      fastmatch_1.1-0     Rcpp_0.12.19       
## [34] Biobase_2.40.0      cellranger_1.1.0    zCompositions_1.1.1
## [37] multtest_2.36.0     gdata_2.18.0        nlme_3.1-137       
## [40] DECIPHER_2.8.1      iterators_1.0.9     psych_1.8.4        
## [43] rvest_0.3.2         phangorn_2.4.0      gtools_3.5.0       
## [46] zlibbioc_1.26.0     MASS_7.3-51.1       scales_0.5.0       
## [49] rtk_0.2.5.7         hms_0.4.2           biomformat_1.10.1  
## [52] rhdf5_2.24.0        yaml_2.2.0          memoise_1.1.0      
## [55] NADA_1.6-1          stringi_1.2.3       RSQLite_2.1.1      
## [58] foreach_1.4.4       tidytree_0.1.9      permute_0.9-4      
## [61] caTools_1.17.1      truncnorm_1.0-8     bitops_1.0-6       
## [64] rlang_0.2.1         pkgconfig_2.0.1     evaluate_0.10.1    
## [67] lattice_0.20-38     Rhdf5lib_1.2.1      bindr_0.1.1        
## [70] treeio_1.4.1        htmlwidgets_1.2     bit_1.1-14         
## [73] tidyselect_0.2.4    plyr_1.8.4          magrittr_1.5       
## [76] R6_2.2.2            picante_1.7         DBI_1.0.0          
## [79] pillar_1.2.3        haven_1.1.1         foreign_0.8-71     
## [82] withr_2.1.2         mgcv_1.8-27         survival_2.43-3    
## [85] modelr_0.1.2        crayon_1.3.4        KernSmooth_2.23-15 
## [88] plotly_4.7.1        rmarkdown_1.10      grid_3.5.3         
## [91] data.table_1.11.4   blob_1.1.1          vegan_2.5-2        
## [94] digest_0.6.15       munsell_0.5.0       viridisLite_0.3.0  
## [97] quadprog_1.5-5

2 Data Import

Here we are reusing data normalized during the diversity analysis (HFD_diversity.html). Using CLR normalized OTUs and PICRUSt KOs, and PhILR.

metadata<-read_tsv("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/CollatedData/metadata_filtered.tsv") %>% 
  mutate(SampleID=DB_Sample) %>% 
  mutate(Diet_Classification=factor(Diet_Classification, levels=c("HFD","LFD"))) %>% 
  as.data.frame()
rownames(metadata)<-metadata$DB_Sample

OTUs<-readRDS("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/Markdown/Diversity_nol/RDS/OTUs.RDS")
trees<-readRDS("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/Markdown/Diversity_nol/RDS/trees.RDS")
tree<-trees$PhILR
rm(trees)
taxonomy<-readRDS("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/Markdown/Diversity_nol/RDS/taxonomy.RDS")

Also getting the KEGG orthology data.

KO<-read.table("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/CollatedData/PICRUSt_nol.tsv", header=T, row.names=1, sep='\t')
KO<-Make.CLR(KO, PRIOR = 0.5)

Will now build a single object containing the data we want to use for downstream analysis.

RFdata<-list()
RFdata$KO<-KO
RFdata$OTU<-OTUs$CLR
RFdata$PhILR<-OTUs$PhILR
rm(OTUs, KO)
saveRDS(RFdata, "RDS/RFdata.RDS")
gc()
##            used  (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells  4363658 233.1    7685835  410.5         NA   6188303  330.5
## Vcells 47730791 364.2  156864731 1196.8      16384 169374346 1292.3

3 Assigning Tasks

Will use a sequential validation approach. Will have 4 groups of studies: * Human Studies (David 2014 and Wu 2011) * Humanized mice (from Goodman 2011 and Turnbaugh 2009) * 3 Randomly selected SPF and MUSD studies to validation across studies (External validation) * 1/3 validation set of SPF and MUSD mice (Internal Validation) * 2/3 training set of SPF and MUSD mice (Training Set)

set.seed(1811)
ValidStudies<-
  metadata %>%
  filter(!StudyID %in% c("David 2014", "Wu 2011","Goodman 2011")) %>%
  pull(StudyID) %>%
  unique() %>%
  sample(., 3)
print(paste("The following studies will act as randomly selected external validation studies:", paste(ValidStudies, collapse=", "), "."))
## [1] "The following studies will act as randomly selected external validation studies: Everard 2014, Xiao 2015, Evans 2014 ."
metadata<-
  metadata %>%
  select(StudyID, SampleID, Species, Background, Colonization, Diet_Classification) %>%
  mutate(Task=case_when(
    StudyID %in% ValidStudies ~ "External Murine Sample",
    Species == "Homo sapiens" ~ "Human",
    Colonization == "HUMD" ~ "Humanized Mice",
    TRUE ~ "Test/Training Set"
  ))

set.seed(1811)
TrainingSet<-
  metadata %>%
    filter(Task=="Test/Training Set") %>%
    pull(SampleID) %>%
    sample(., round(2/3*(length(.)), 0))

metadata<-
  metadata %>%
  mutate(Task=case_when(
    SampleID %in% TrainingSet ~ "Murine Training Set",
    Task == "Test/Training Set" ~ "Murine Test Set",
    TRUE ~ Task
  ))

saveRDS(metadata, "RDS/RFmetadata.RDS")

3.1 Summary of assignments

metadata %>%
  group_by(Task) %>%
  summarize(Nsample=length(SampleID)) %>%
  arrange(desc(Nsample)) %>%
  Nice.Table()

4 10-fold cross validation for feature selection

CV<-list()
  for(x in names(RFdata)){
    message(date(), " ", x)
    CV[[x]]<-
      rfcv(
          trainx=t(RFdata[[x]][,subset(metadata, Task=="Murine Training Set")$SampleID]),
          trainy=subset(metadata, Task=="Murine Training Set")$Diet_Classification,
          cv.fold=10
    )
  saveRDS(CV, "RDS/CVs.RDS")
  }
do.call(rbind, lapply(names(CV), function(x) data.frame(Error=CV[[x]]$error.cv) %>% rownames_to_column("nFeature") %>% mutate(Data=x))) %>%
  as.tibble() %>%
  mutate(nFeature=as.numeric(nFeature)) -> CVerror

ggplot(CVerror, aes(x=nFeature, y=Error, group=Data, color=Data)) +
  geom_line() +
  geom_point() +
  theme_classic() +
  theme(legend.position=c(0.8, 0.8)) +
  xlab("# features") +
  ylab("cross-validation error rate (%)") +
  #scale_x_continuous(breaks=seq(0,15000,2000))
  scale_x_continuous(trans="log2", breaks=2^(0:14)) +
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggsave("figures/cv_curves.pdf", device="pdf", height=4, width=4, useDingbats=F)

4.1 Error by number of features

CVerror %>% Nice.Table()

Based on the results and the saturation that occurs going forward will use the number of features below based on where error rate was saturated in the plots above.

NFeats<-list()
NFeats$OTU<-228
NFeats$KO<-432
NFeats$PhILR<-456

4.2 Informative Features

Will rerun the random forests and extract the N most important features based on the mean decrease in GINI.

FEAT<-list()
  for(x in names(RFdata)){
    message(date(), " ", x)
    FEAT[[x]]<-
      randomForest(
          x=t(RFdata[[x]][,subset(metadata, Task=="Murine Training Set")$SampleID]),
          y=subset(metadata, Task=="Murine Training Set")$Diet_Classification,
          importance=TRUE
    )
  saveRDS(FEAT, "RDS/FEAT.RDS")
  }
Features<-
lapply(names(FEAT), function(x) {
  FEAT[[x]]$importance %>% 
    as.data.frame() %>% 
    rownames_to_column("Feature") %>% 
    arrange(desc(MeanDecreaseGini)) %>% 
    top_n(NFeats[[x]], MeanDecreaseGini) %>%
    mutate(Type=x)
  }) %>%
  do.call(bind_rows, .) %>%
  select(Type, Feature, everything())

Will now annotate these top features.

Annotations<-list()
Annotations$KO<-read_tsv("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/PICRUSt/mgm4535747.3_mgm4535747.3.picrust", skip=2, col_names=F) %>% mutate(Type="KO") %>% select(Type, Feature=X1, Annotation=X3)
Annotations$OTU<-taxonomy %>% mutate(Annotation=paste0(OTU, "|", gsub(" ","", Taxonomy))) %>% mutate(Type="OTU") %>% mutate(OTU=as.character(OTU)) %>% select(Type, Feature=OTU, Annotation)
Annotations$PhILR<-lapply(subset(Features, Type=="PhILR")$Feature, function(x) tibble(Type="PhILR", Feature=x, Annotation=name.balance(tr=tree, tax=taxonomy, coord=x))) %>% do.call(bind_rows, .)
Features<-Features %>% left_join(do.call(bind_rows, Annotations))
rm(Annotations)
gc()
##            used  (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells  4578410 244.6    7685835  410.5         NA   7685835  410.5
## Vcells 49007385 373.9  156864731 1196.8      16384 169374346 1292.3

Now will add an estimate of differential abundance between them which will be the log fold change (mean HFD - mean LFD).

FCs<-
rbind(RFdata$KO, RFdata$OTU, RFdata$PhILR)[Features$Feature,] %>%
  as.data.frame() %>%
  rownames_to_column("Feature") %>%
  as_tibble() %>%
  gather(-Feature, key=SampleID, value=Abundance) %>%
  left_join(metadata %>% select(SampleID, Task, Diet_Classification)) %>%
  filter(Task=="Murine Training Set") %>%
  group_by(Feature, Diet_Classification) %>%
  summarize(mean=mean(Abundance)) %>%
  spread(key=Diet_Classification, value=mean) %>%
  mutate(logFC=HFD-LFD) %>%
  ungroup() %>%
  select(Feature, HFD_mean=HFD, LFD_mean=LFD, logFC)

Features<-Features %>% as_tibble() %>% left_join(FCs)

And finally will do a spearman correlation between abundance and fat as a continuous variable.

Cors<-
rbind(RFdata$KO, RFdata$OTU, RFdata$PhILR)[Features$Feature,] %>%
  as.data.frame() %>%
  rownames_to_column("Feature") %>%
  as_tibble() %>%
  gather(-Feature, key="SampleID", value="Abundance") %>%
  left_join(read_tsv("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/CollatedData/metadata_filtered.tsv") %>% select(SampleID=DB_Sample, P_Fat)) %>%
  filter(!is.na(P_Fat)) %>%
  group_by(Feature) %>%
  do(
    broom::tidy(cor.test(.$P_Fat,.$Abundance, method="spearman"))
  ) %>%
  select(Feature, rho=estimate, Pvalue=p.value) %>%
  ungroup() %>%
  mutate(FDR=p.adjust(Pvalue, method="BH"))

Features<-Features %>% left_join(Cors) %>% as_tibble() %>% select(Type, Feature, HFD_mean, LFD_mean, logFC, MeanDecreaseAccuracy, MeanDecreaseGini, rho, Pvalue, FDR, Annotation)

4.3 Most Informative Features

write_tsv(Features,"figures/MostImportantFeatures.txt")
Nice.Table(Features)

5 Visualizations of most important features

5.1 OTUs

Features %>%
  filter(Type=="OTU") %>%
  top_n(10, MeanDecreaseGini) %>%
  mutate(Annotation=gsub("__"," ", Annotation)) %>%
  mutate(Annotation=factor(Annotation, levels=Annotation)) %>%
  ggplot(aes(x=Annotation, y=MeanDecreaseGini, fill=logFC)) +
  geom_bar(stat="identity", color="black") +
  theme_MicrobeR() +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  scale_fill_gradient2(low=LFDcolor, high=HFDcolor) +
  theme(legend.position=c(0.9,0.7))

ggsave("figures/MostImpOTUs.pdf", device="pdf", height=8, width=6, useDingbats=F)

5.2 PhILR Nodes with OTUs

Here the importance of the node is being denoted by its shading while the OTUs are also being superimposed with their fold change as calculated above.

ptree<-drop.tip(tree, tree$tip.label[!tree$tip.label %in% Features$Feature]) #ptree<-tree #reduced for simplified viewing
plottree<-ggtree(ptree, layout = "fan", color="black") #
plottree<-plottree %<+% (Features %>% select(Feature, everything()))
# annotate a set of nodes of interest
interest<-c("p__Bacteroidetes","p__Firmicutes","g__Lactococcus")
for( i in interest){
  tips<-taxonomy %>% filter(grepl(i, Taxonomy)) %>% filter(OTU %in% ptree$tip.label) %>% pull(OTU)
  nd<-getMRCA(phy=ptree, tip=as.character(tips))
  plottree<- 
    plottree + geom_hilight(
    node=nd,
    fill="grey50",
    alpha=0.4
    ) +
    geom_cladelabel(node=nd, label=i, offset = 0.03)
}

plottree +
  geom_tippoint(aes(size=MeanDecreaseGini, fill=logFC), color="black", shape=21, alpha=0.9) +
  scale_fill_gradient2(low=LFDcolor, high=HFDcolor) +
  geom_nodepoint(aes(size=MeanDecreaseGini)) +
  theme(legend.position="right")

  #geom_nodelab() +



ggsave("figures/rftree.pdf", device="pdf", height=9, width=11, useDingbats=F)

5.3 KOs

enrich<-clusterProfiler::enrichKEGG(Features %>% filter(Type=="KO") %>% pull(Feature), organism="ko", pAdjustMethod="BH")

enrich<-as.data.frame(enrich) %>% 
  as.tibble() %>%
  filter(p.adjust<0.1)
Nice.Table(enrich)

Now we can plot the network.

d<-as.data.frame(enrich)
membership<-strsplit(d$geneID, split="/")
names(membership)<-d$ID
membership<-plyr::ldply(membership, data.frame)
colnames(membership)<-c("Pathway","Feature")
membership<-membership %>% left_join(Features) %>% left_join(d, by=c("Pathway"="ID"))

#http://kateto.net/network-visualization
#For each node, want: ID  \t  FDR \t log2FoldChange
nodes<-Features %>% filter(Type=="KO") %>% select(Feature, MeanDecreaseGini, logFC)
nodes$Type<-"Feature"
nodes<-nodes[nodes$Feature %in% unlist(strsplit(d$geneID, split="/")),]
dd<-d
dd$log2FoldChange<-NA
dd<-dd[,c("Description","log2FoldChange","p.adjust")]
dd$Type<-"Pathway"
colnames(dd)<-colnames(nodes)
nodes<-rbind(nodes, dd)

nodes<-nodes
links<-membership[,c("Description","Feature")]

net <- igraph::graph_from_data_frame(d=links, vertices=nodes, directed=F) 
#plot_network(net) #using phyloseq's code
g<-net
edgeDF <- data.frame(igraph::get.edgelist(g))
    edgeDF$id <- 1:length(edgeDF[, 1])
    vertDF <- igraph::layout.auto(g)
    colnames(vertDF) <- c("x", "y")
    vertDF <- data.frame(value = igraph::get.vertex.attribute(g, "name"),
        vertDF)
    
#extraData = nodes[as.character(vertDF$value), drop = FALSE]
#vertDF <- data.frame(vertDF, extraData)
vertDF<-vertDF %>% left_join(nodes, by=c("value"="Feature"))
    
    graphDF <- merge(reshape2::melt(edgeDF, id = "id"), vertDF, 
        by = "value")
    
    point_size="MeanDecreaseGini"
    color="log2FC"

 # vertDF$FDR.Colonization[vertDF$Type=="Pathway"]=0.001
  #vertDF$log2FoldChange[vertDF$log2FoldChange>6]=6
  #vertDF$log2FoldChange[vertDF$log2FoldChange<(-6)]=-6  
    vertDF<-vertDF %>% mutate(MeanDecreaseGini=if_else(Type=="Pathway", 12.5, MeanDecreaseGini))

    Knet<-
    ggplot() +
    geom_line(data=graphDF, aes(x=x, y=y, group = id), color="grey", alpha=0.4) +
    geom_point(data=vertDF, aes(x,y, fill = logFC, size = log2(MeanDecreaseGini), shape=Type), color="grey80") +
    geom_text(data=subset(vertDF, Type=="Pathway"), aes(x,y, label = value), size = 2, hjust = 0, vjust=1) +
    coord_equal() +
    scale_fill_gradientn(colors=c("cornflowerblue", "white", "indianred", "darkred")) +
    theme_minimal() +
    theme(axis.text.x=element_blank(), axis.text.y=element_blank(),axis.ticks=element_blank()) +
    theme(axis.title.x=element_blank(),axis.title.y=element_blank()) +
    theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank()) +
    scale_shape_manual(values=c(21, 23))

Knet

ggsave("figures/RFpicrust_wolabs.pdf", Knet, device="pdf", width=8, height=8)

Knet + geom_text(data=subset(vertDF, Type=="Feature"), aes(x,y, label = value), size = 2, hjust = 0, vjust=1)

ggsave("figures/RFpicrust_wlabs.pdf", device="pdf", width=8, height=8)

6 Testing models on external data

models<-lapply(RFdata, function(x){
     randomForest(
        x=t(x[rownames(x) %in% Features$Feature,subset(metadata, Task=="Murine Training Set")$SampleID]),
        y=subset(metadata, Task=="Murine Training Set")$Diet_Classification)
})

models
## $KO
## 
## Call:
##  randomForest(x = t(x[rownames(x) %in% Features$Feature, subset(metadata,      Task == "Murine Training Set")$SampleID]), y = subset(metadata,      Task == "Murine Training Set")$Diet_Classification) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 20
## 
##         OOB estimate of  error rate: 8.96%
## Confusion matrix:
##     HFD LFD class.error
## HFD 255  20  0.07272727
## LFD  31 263  0.10544218
## 
## $OTU
## 
## Call:
##  randomForest(x = t(x[rownames(x) %in% Features$Feature, subset(metadata,      Task == "Murine Training Set")$SampleID]), y = subset(metadata,      Task == "Murine Training Set")$Diet_Classification) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 15
## 
##         OOB estimate of  error rate: 10.37%
## Confusion matrix:
##     HFD LFD class.error
## HFD 244  31   0.1127273
## LFD  28 266   0.0952381
## 
## $PhILR
## 
## Call:
##  randomForest(x = t(x[rownames(x) %in% Features$Feature, subset(metadata,      Task == "Murine Training Set")$SampleID]), y = subset(metadata,      Task == "Murine Training Set")$Diet_Classification) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 21
## 
##         OOB estimate of  error rate: 9.49%
## Confusion matrix:
##     HFD LFD class.error
## HFD 251  24  0.08727273
## LFD  30 264  0.10204082

In addition, will create a simple logistic model based solely on the F/B ratio.

FB<-Summarize.Taxa(
    readRDS("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/Markdown/Diversity_nol/RDS/OTUs.RDS")$Filtered,
    readRDS("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/Markdown/Diversity_nol/RDS/taxonomy.RDS")
  )$Phylum + 0.5

RFdata$FBratio<-
  FB %>%
  as.data.frame() %>%
  Make.Percent() %>%
  as.data.frame() %>%
  rownames_to_column("Phylum") %>%
  as.tibble() %>%
  filter(grepl("Firmicutes|Bacteroidetes", Phylum)) %>%
  gather(-Phylum, key=Sample, value=abundance) %>%
  spread(key=Phylum, value=abundance) %>%
  mutate(log2FB=log2(`k__Bacteria;p__Firmicutes`/`k__Bacteria;p__Bacteroidetes`)) %>%
  select(SampleID=Sample, log2FB) %>%
  left_join(metadata)
models$FBratio<-glm(Diet_Classification~log2FB, data=subset(RFdata$FBratio, Task=="Murine Training Set"), family=binomial(link="logit"))
models$FBratio
## 
## Call:  glm(formula = Diet_Classification ~ log2FB, family = binomial(link = "logit"), 
##     data = subset(RFdata$FBratio, Task == "Murine Training Set"))
## 
## Coefficients:
## (Intercept)       log2FB  
##      0.5811      -0.5332  
## 
## Degrees of Freedom: 568 Total (i.e. Null);  567 Residual
## Null Deviance:       788.2 
## Residual Deviance: 655.3     AIC: 659.3
summary(models$FBratio)
## 
## Call:
## glm(formula = Diet_Classification ~ log2FB, family = binomial(link = "logit"), 
##     data = subset(RFdata$FBratio, Task == "Murine Training Set"))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0292  -1.0210   0.4185   0.9749   2.2708  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.58110    0.10673   5.444  5.2e-08 ***
## log2FB      -0.53324    0.05681  -9.386  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 788.17  on 568  degrees of freedom
## Residual deviance: 655.31  on 567  degrees of freedom
## AIC: 659.31
## 
## Number of Fisher Scoring iterations: 4
saveRDS(models, "RDS/models.RDS")
Predictions<-list()
AUCs<-list()
for(Tasks in unique(metadata$Task)){
  Predictions[[Tasks]]<-list()
  AUCs[[Tasks]]<-list()
  for(Types in c("PhILR","KO","OTU","FBratio")){
      message(Tasks,"->",Types)
    if(Types!="FBratio"){
    tm<-predict(models[[Types]], newdata=t(RFdata[[Types]][,subset(metadata, Task==Tasks)$SampleID]), type="prob")[,2] %>%
        prediction(., subset(metadata, Task==Tasks)$Diet_Classification) %>%
        performance(., "tpr","fpr")
    Predictions[[Tasks]][[Types]]<-
        tibble(Task=Tasks, Type=Types, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
    
    auc<-predict(models[[Types]], newdata=t(RFdata[[Types]][,subset(metadata, Task==Tasks)$SampleID]), type="prob")[,2] %>%
        prediction(., subset(metadata, Task==Tasks)$Diet_Classification) %>%
        performance(., "auc")
    
      AUCs[[Tasks]][[Types]]<-
        tibble(
          Task=Tasks,
          Type=Types,
          AUC=auc@y.values[[1]]
        )
    } else {
      #for FBratio only
      tm<-predict(object=models[[Types]], newdata=subset(RFdata[[Types]], Task==Tasks ), type="response") %>% # must select the appropriate samples
      prediction(., subset(RFdata[[Types]], Task==Tasks )$Diet_Classification) %>%
      performance(., "tpr","fpr")
    
     Predictions[[Tasks]][[Types]]<-
        tibble(Task=Tasks, Type=Types, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
    
    auc<-predict(object=models[[Types]], newdata=subset(RFdata[[Types]], Task==Tasks ), type="response") %>% # must select the appropriate samples
      prediction(., subset(RFdata[[Types]], Task==Tasks )$Diet_Classification) %>%
      performance(., "auc")
    
      AUCs[[Tasks]][[Types]]<-
        tibble(
          Task=Tasks,
          Type=Types,
          AUC=auc@y.values[[1]]
        )
    }
  }
}



Predictions<-lapply(Predictions, function(x) do.call(bind_rows, x)) %>% do.call(bind_rows, .)
AUCs<-lapply(AUCs, function(x) do.call(bind_rows, x)) %>% do.call(bind_rows, .)

6.0.1 AUCs

Nice.Table(AUCs %>% spread(key=Type, value=AUC))
write_tsv(AUCs,"figures/AUCs.txt")

6.0.2 ROCs

Predictions %>%
  mutate(Type=factor(Type, levels=c("FBratio", "OTU","PhILR","KO"))) %>%
  mutate(Task=factor(Task, levels=c("Murine Training Set","Murine Test Set","External Murine Sample","Humanized Mice","Human"))) %>%
ggplot(aes(x=FPR, y=TPR, color=Task, group=Task)) +
        geom_abline(color="grey", linetype="dashed") +
        geom_line() +
        xlab("false positive") +
        ylab("true positive") +
        theme_MicrobeR() +
        facet_grid(~Type) +
        theme(axis.text.x=element_text(angle=45, hjust=1)) +
        coord_cartesian(expand=F)

ggsave("figures/ROCs.pdf", device="pdf", height=2, width=6, useDingbats=F)
AUCs %>%
  mutate(Type=factor(Type, levels=c("OTU","PhILR","FBratio","KO"))) %>%
  mutate(Task=factor(Task, levels=c("Murine Training Set","Murine Test Set","External Murine Sample","Humanized Mice","Human"))) %>%
  ggplot(aes(x=Task, y=AUC, group=Type, fill=Type)) +
  geom_bar(stat="identity", position=position_dodge(), color="black") +
  theme_MicrobeR() +
  coord_cartesian(ylim=c(0.4,1.02), expand=0) +
  scale_fill_viridis_d()

ggsave("figures/Auc_plots.pdf", device="pdf", height=2, width=6, useDingbats=F)

6.1 Prediction performance on a per study basis

#Now Task is really coding for Study
Predictions<-list()
AUCs<-list()
for(Tasks in unique(metadata$StudyID)[!unique(metadata$StudyID) %in% c("Anhe 2015","Howe 2016","Hu 2015","Ruan 2016","Lu 2017")]){ #must remove small studies with single samples in the test sets or not enough samples for meaningful number
  Predictions[[Tasks]]<-list()
  AUCs[[Tasks]]<-list()
  for(Types in c("PhILR","KO","OTU","FBratio")){
      message(Tasks,"->",Types)
    if(Types!="FBratio"){
    tm<-predict(models[[Types]], newdata=t(RFdata[[Types]][,subset(metadata, StudyID==Tasks & Task!="Murine Training Set")$SampleID]), type="prob")[,2] %>%
        prediction(., subset(metadata, StudyID==Tasks & Task!="Murine Training Set" )$Diet_Classification) %>%
        performance(., "tpr","fpr")
    Predictions[[Tasks]][[Types]]<-
        tibble(Task=Tasks, Type=Types, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
    
    auc<-predict(models[[Types]], newdata=t(RFdata[[Types]][,subset(metadata, StudyID==Tasks & Task!="Murine Training Set")$SampleID]), type="prob")[,2] %>%
        prediction(., subset(metadata, StudyID==Tasks & Task!="Murine Training Set")$Diet_Classification) %>%
        performance(., "auc")
    
      AUCs[[Tasks]][[Types]]<-
        tibble(
          Task=Tasks,
          Type=Types,
          AUC=auc@y.values[[1]]
        )
    } else {
      #for FBratio only
      tm<-predict(object=models[[Types]], newdata=subset(RFdata[[Types]], StudyID==Tasks & Task!="Murine Training Set"), type="response") %>% # must select the appropriate samples
      prediction(., subset(RFdata[[Types]], StudyID==Tasks & Task!="Murine Training Set")$Diet_Classification) %>%
      performance(., "tpr","fpr")
    
     Predictions[[Tasks]][[Types]]<-
        tibble(Task=Tasks, Type=Types, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
    
    auc<-predict(object=models[[Types]], newdata=subset(RFdata[[Types]], StudyID==Tasks & Task!="Murine Training Set"), type="response") %>% # must select the appropriate samples
      prediction(., subset(RFdata[[Types]], StudyID==Tasks & Task!="Murine Training Set")$Diet_Classification) %>%
      performance(., "auc")
    
      AUCs[[Tasks]][[Types]]<-
        tibble(
          Task=Tasks,
          Type=Types,
          AUC=auc@y.values[[1]]
        )
    }
  }
}



Predictions<-lapply(Predictions, function(x) do.call(bind_rows, x)) %>% do.call(bind_rows, .)
AUCs<-lapply(AUCs, function(x) do.call(bind_rows, x)) %>% do.call(bind_rows, .)

ord<-AUCs %>% spread(key=Type, value=AUC) %>% as.data.frame() %>% column_to_rownames("Task") %>% dist(., "euclidian") %>% hclust()

AUCs %>%
  mutate(Type=factor(Type, levels=c("FBratio","OTU","PhILR","KO"))) %>%
  mutate(Task=factor(Task, levels=ord$labels[ord$order])) %>%
  mutate(AUC=if_else(AUC<0.5, 0.5, AUC)) %>%
  ggplot(aes(y=Task, x=Type, fill=AUC)) +
  geom_tile() +
  scale_fill_viridis_c() +
  theme_MicrobeR() +
  coord_cartesian(expand=0)

ggsave("figures/Auc_heatmap.pdf", device="pdf", height=5, width=3, useDingbats=F)
write_tsv(AUCs,"figures/AUCs_bystudy.txt")

7 Do Humanized Mice Improve prediction of human samples?

For the purposes of this analysis will skip the feature selection and go straight to model and prediction.

models<-lapply(RFdata[1:3], function(x){
     randomForest(
        x=t(x[,subset(metadata, Task=="Humanized Mice")$SampleID]),
        y=subset(metadata, Task=="Humanized Mice")$Diet_Classification)
})
models$FBratio<-glm(Diet_Classification~log2FB, data=subset(RFdata$FBratio, Task=="Humanized Mice"), family=binomial(link="logit"))
summary(models$FBratio)
## 
## Call:
## glm(formula = Diet_Classification ~ log2FB, family = binomial(link = "logit"), 
##     data = subset(RFdata$FBratio, Task == "Humanized Mice"))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.85810  -0.33724   0.07138   0.34977   1.86213  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)    4.583      1.554   2.950  0.00318 **
## log2FB        -4.556      1.502  -3.033  0.00242 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 63.683  on 45  degrees of freedom
## Residual deviance: 26.466  on 44  degrees of freedom
## AIC: 30.466
## 
## Number of Fisher Scoring iterations: 7
models
## $KO
## 
## Call:
##  randomForest(x = t(x[, subset(metadata, Task == "Humanized Mice")$SampleID]),      y = subset(metadata, Task == "Humanized Mice")$Diet_Classification) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 83
## 
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##     HFD LFD class.error
## HFD  22   0           0
## LFD   0  24           0
## 
## $OTU
## 
## Call:
##  randomForest(x = t(x[, subset(metadata, Task == "Humanized Mice")$SampleID]),      y = subset(metadata, Task == "Humanized Mice")$Diet_Classification) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 120
## 
##         OOB estimate of  error rate: 6.52%
## Confusion matrix:
##     HFD LFD class.error
## HFD  21   1  0.04545455
## LFD   2  22  0.08333333
## 
## $PhILR
## 
## Call:
##  randomForest(x = t(x[, subset(metadata, Task == "Humanized Mice")$SampleID]),      y = subset(metadata, Task == "Humanized Mice")$Diet_Classification) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 120
## 
##         OOB estimate of  error rate: 2.17%
## Confusion matrix:
##     HFD LFD class.error
## HFD  22   0  0.00000000
## LFD   1  23  0.04166667
## 
## $FBratio
## 
## Call:  glm(formula = Diet_Classification ~ log2FB, family = binomial(link = "logit"), 
##     data = subset(RFdata$FBratio, Task == "Humanized Mice"))
## 
## Coefficients:
## (Intercept)       log2FB  
##       4.583       -4.556  
## 
## Degrees of Freedom: 45 Total (i.e. Null);  44 Residual
## Null Deviance:       63.68 
## Residual Deviance: 26.47     AIC: 30.47
Predictions<-list()
AUCs<-list()
for(Tasks in unique(metadata$Task)){
  Predictions[[Tasks]]<-list()
  AUCs[[Tasks]]<-list()
  for(Types in c("PhILR","KO","OTU","FBratio")){
      message(Tasks,"->",Types)
    if(Types!="FBratio"){
    tm<-predict(models[[Types]], newdata=t(RFdata[[Types]][,subset(metadata, Task==Tasks)$SampleID]), type="prob")[,2] %>%
        prediction(., subset(metadata, Task==Tasks)$Diet_Classification) %>%
        performance(., "tpr","fpr")
    Predictions[[Tasks]][[Types]]<-
        tibble(Task=Tasks, Type=Types, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
    
    auc<-predict(models[[Types]], newdata=t(RFdata[[Types]][,subset(metadata, Task==Tasks)$SampleID]), type="prob")[,2] %>%
        prediction(., subset(metadata, Task==Tasks)$Diet_Classification) %>%
        performance(., "auc")
    
      AUCs[[Tasks]][[Types]]<-
        tibble(
          Task=Tasks,
          Type=Types,
          AUC=auc@y.values[[1]]
        )
    } else {
      #for FBratio only
      tm<-predict(object=models[[Types]], newdata=subset(RFdata[[Types]], Task==Tasks ), type="response") %>% # must select the appropriate samples
      prediction(., subset(RFdata[[Types]], Task==Tasks )$Diet_Classification) %>%
      performance(., "tpr","fpr")
    
     Predictions[[Tasks]][[Types]]<-
        tibble(Task=Tasks, Type=Types, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
    
    auc<-predict(object=models[[Types]], newdata=subset(RFdata[[Types]], Task==Tasks ), type="response") %>% # must select the appropriate samples
      prediction(., subset(RFdata[[Types]], Task==Tasks )$Diet_Classification) %>%
      performance(., "auc")
    
      AUCs[[Tasks]][[Types]]<-
        tibble(
          Task=Tasks,
          Type=Types,
          AUC=auc@y.values[[1]]
        )
    }
  }
}



Predictions<-lapply(Predictions, function(x) do.call(bind_rows, x)) %>% do.call(bind_rows, .)
AUCs<-lapply(AUCs, function(x) do.call(bind_rows, x)) %>% do.call(bind_rows, .)

7.0.1 AUCs

Nice.Table(AUCs %>% spread(key=Type, value=AUC))
write_tsv(AUCs,"figures/AUCs_hum.txt")

7.0.2 ROCs

Predictions %>%
  mutate(Type=factor(Type, levels=c("OTU","PhILR","FBratio","KO"))) %>%
  mutate(Task=factor(Task, levels=c("Murine Training Set","Murine Test Set","External Murine Sample","Humanized Mice","Human"))) %>%
ggplot(aes(x=FPR, y=TPR, color=Task, group=Task)) +
        geom_abline(color="grey", linetype="dashed") +
        geom_line() +
        xlab("false positive") +
        ylab("true positive") +
        theme_MicrobeR() +
        facet_grid(~Type) +
        theme(axis.text.x=element_text(angle=45, hjust=1))

ggsave("figures/ROCs_hum.pdf", device="pdf", height=2, width=6, useDingbats=F)
AUCs %>%
  mutate(Type=factor(Type, levels=c("FBratio","OTU","PhILR","KO"))) %>%
  mutate(Task=factor(Task, levels=c("Murine Training Set","Murine Test Set","External Murine Sample","Humanized Mice","Human"))) %>%
  ggplot(aes(x=Task, y=AUC, group=Type, fill=Type)) +
  geom_bar(stat="identity", position=position_dodge(), color="black") +
  theme_MicrobeR() +
  coord_cartesian(ylim=c(0.4,1.02), expand=0) +
  scale_fill_viridis_d()

ggsave("figures/Auc_plots_hum.pdf", device="pdf", height=2, width=6, useDingbats=F)
save.image(paste0("RDS/Session",format(Sys.time(),"%d%b%Y_%H%M%S"),".Rdata"))
gc()
##            used  (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells  5324461 284.4    9807835  523.8         NA   7685835  410.5
## Vcells 51052366 389.5  150654141 1149.4      16384 188317637 1436.8

Return to main page