library(Biostrings)
library(tidyverse)
library(readxl)
library(MicrobeR)
library(philr)
library(ggtree)
library(ape)
library(randomForest)
library(ROCR)
knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning=FALSE, tidy=FALSE, cache=FALSE)
HFDcolor="#E69F00"
LFDcolor="#0072B2"
sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ROCR_1.0-7 gplots_3.0.1 randomForest_4.6-14
## [4] ape_5.2 ggtree_1.12.0 philr_1.6.0
## [7] MicrobeR_0.3.2 readxl_1.1.0 forcats_0.3.0
## [10] stringr_1.3.1 dplyr_0.7.5 purrr_0.2.5
## [13] readr_1.1.1 tidyr_0.8.1 tibble_1.4.2
## [16] ggplot2_3.0.0 tidyverse_1.2.1 Biostrings_2.48.0
## [19] XVector_0.20.0 IRanges_2.14.10 S4Vectors_0.18.3
## [22] BiocGenerics_0.26.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.3-2 rprojroot_1.3-2 rstudioapi_0.7
## [4] DT_0.4 bit64_0.9-7 lubridate_1.7.4
## [7] xml2_1.2.0 codetools_0.2-16 splines_3.5.3
## [10] mnormt_1.5-5 knitr_1.20 ade4_1.7-11
## [13] jsonlite_1.5 phyloseq_1.24.0 broom_0.4.4
## [16] cluster_2.0.7-1 compiler_3.5.3 httr_1.3.1
## [19] rvcheck_0.1.0 backports_1.1.2 assertthat_0.2.0
## [22] Matrix_1.2-15 lazyeval_0.2.1 cli_1.0.0
## [25] htmltools_0.3.6 tools_3.5.3 bindrcpp_0.2.2
## [28] igraph_1.2.1 gtable_0.2.0 glue_1.2.0
## [31] reshape2_1.4.3 fastmatch_1.1-0 Rcpp_0.12.19
## [34] Biobase_2.40.0 cellranger_1.1.0 zCompositions_1.1.1
## [37] multtest_2.36.0 gdata_2.18.0 nlme_3.1-137
## [40] DECIPHER_2.8.1 iterators_1.0.9 psych_1.8.4
## [43] rvest_0.3.2 phangorn_2.4.0 gtools_3.5.0
## [46] zlibbioc_1.26.0 MASS_7.3-51.1 scales_0.5.0
## [49] rtk_0.2.5.7 hms_0.4.2 biomformat_1.10.1
## [52] rhdf5_2.24.0 yaml_2.2.0 memoise_1.1.0
## [55] NADA_1.6-1 stringi_1.2.3 RSQLite_2.1.1
## [58] foreach_1.4.4 tidytree_0.1.9 permute_0.9-4
## [61] caTools_1.17.1 truncnorm_1.0-8 bitops_1.0-6
## [64] rlang_0.2.1 pkgconfig_2.0.1 evaluate_0.10.1
## [67] lattice_0.20-38 Rhdf5lib_1.2.1 bindr_0.1.1
## [70] treeio_1.4.1 htmlwidgets_1.2 bit_1.1-14
## [73] tidyselect_0.2.4 plyr_1.8.4 magrittr_1.5
## [76] R6_2.2.2 picante_1.7 DBI_1.0.0
## [79] pillar_1.2.3 haven_1.1.1 foreign_0.8-71
## [82] withr_2.1.2 mgcv_1.8-27 survival_2.43-3
## [85] modelr_0.1.2 crayon_1.3.4 KernSmooth_2.23-15
## [88] plotly_4.7.1 rmarkdown_1.10 grid_3.5.3
## [91] data.table_1.11.4 blob_1.1.1 vegan_2.5-2
## [94] digest_0.6.15 munsell_0.5.0 viridisLite_0.3.0
## [97] quadprog_1.5-5
Here we are reusing data normalized during the diversity analysis (HFD_diversity.html). Using CLR normalized OTUs and PICRUSt KOs, and PhILR.
metadata<-read_tsv("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/CollatedData/metadata_filtered.tsv") %>%
mutate(SampleID=DB_Sample) %>%
mutate(Diet_Classification=factor(Diet_Classification, levels=c("HFD","LFD"))) %>%
as.data.frame()
rownames(metadata)<-metadata$DB_Sample
OTUs<-readRDS("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/Markdown/Diversity/RDS/OTUs.RDS")
trees<-readRDS("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/Markdown/Diversity/RDS/trees.RDS")
tree<-trees$PhILR
rm(trees)
taxonomy<-readRDS("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/Markdown/Diversity/RDS/taxonomy.RDS")
Also getting the KEGG orthology data.
KO<-read.table("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/CollatedData/PICRUSt.tsv", header=T, row.names=1, sep='\t')
KO<-Make.CLR(KO, PRIOR = 0.5)
Will now build a single object containing the data we want to use for downstream analysis.
RFdata<-list()
RFdata$KO<-KO
RFdata$OTU<-OTUs$CLR
RFdata$PhILR<-OTUs$PhILR
rm(OTUs, KO)
saveRDS(RFdata, "RDS/RFdata.RDS")
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4363825 233.1 7686095 410.5 NA 6188436 330.5
## Vcells 47824794 364.9 157258044 1199.8 16384 169784045 1295.4
Will use a sequential validation approach. Will have 4 groups of studies: * Human Studies (David 2014 and Wu 2011) * Humanized mice (from Goodman 2011 and Turnbaugh 2009) * 3 Randomly selected SPF and MUSD studies to validation across studies (External validation) * 1/3 validation set of SPF and MUSD mice (Internal Validation) * 2/3 training set of SPF and MUSD mice (Training Set)
set.seed(1811)
ValidStudies<-
metadata %>%
filter(!StudyID %in% c("David 2014", "Wu 2011","Goodman 2011")) %>%
pull(StudyID) %>%
unique() %>%
sample(., 3)
print(paste("The following studies will act as randomly selected external validation studies:", paste(ValidStudies, collapse=", "), "."))
## [1] "The following studies will act as randomly selected external validation studies: Everard 2014, Xiao 2015, Evans 2014 ."
metadata<-
metadata %>%
select(StudyID, SampleID, Species, Background, Colonization, Diet_Classification) %>%
mutate(Task=case_when(
StudyID %in% ValidStudies ~ "External Murine Sample",
Species == "Homo sapiens" ~ "Human",
Colonization == "HUMD" ~ "Humanized Mice",
TRUE ~ "Test/Training Set"
))
set.seed(1811)
TrainingSet<-
metadata %>%
filter(Task=="Test/Training Set") %>%
pull(SampleID) %>%
sample(., round(2/3*(length(.)), 0))
metadata<-
metadata %>%
mutate(Task=case_when(
SampleID %in% TrainingSet ~ "Murine Training Set",
Task == "Test/Training Set" ~ "Murine Test Set",
TRUE ~ Task
))
saveRDS(metadata, "RDS/RFmetadata.RDS")
metadata %>%
group_by(Task) %>%
summarize(Nsample=length(SampleID)) %>%
arrange(desc(Nsample)) %>%
Nice.Table()
CV<-list()
for(x in names(RFdata)){
message(date(), " ", x)
CV[[x]]<-
rfcv(
trainx=t(RFdata[[x]][,subset(metadata, Task=="Murine Training Set")$SampleID]),
trainy=subset(metadata, Task=="Murine Training Set")$Diet_Classification,
cv.fold=10
)
saveRDS(CV, "RDS/CVs.RDS")
}
do.call(rbind, lapply(names(CV), function(x) data.frame(Error=CV[[x]]$error.cv) %>% rownames_to_column("nFeature") %>% mutate(Data=x))) %>%
as.tibble() %>%
mutate(nFeature=as.numeric(nFeature)) -> CVerror
ggplot(CVerror, aes(x=nFeature, y=Error, group=Data, color=Data)) +
geom_line() +
geom_point() +
theme_classic() +
theme(legend.position=c(0.8, 0.8)) +
xlab("# features") +
ylab("cross-validation error rate (%)") +
#scale_x_continuous(breaks=seq(0,15000,2000))
scale_x_continuous(trans="log2", breaks=2^(0:14)) +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggsave("figures/cv_curves.pdf", device="pdf", height=4, width=4, useDingbats=F)
CVerror %>% Nice.Table()
Based on the results and the saturation that occurs going forward will use the number of features below based on where error rate was saturated in the plots above.
NFeats<-list()
NFeats$OTU<-229
NFeats$KO<-216
NFeats$PhILR<-457
Will rerun the random forests and extract the N most important features based on the mean decrease in GINI.
FEAT<-list()
for(x in names(RFdata)){
message(date(), " ", x)
FEAT[[x]]<-
randomForest(
x=t(RFdata[[x]][,subset(metadata, Task=="Murine Training Set")$SampleID]),
y=subset(metadata, Task=="Murine Training Set")$Diet_Classification,
importance=TRUE
)
saveRDS(FEAT, "RDS/FEAT.RDS")
}
Features<-
lapply(names(FEAT), function(x) {
FEAT[[x]]$importance %>%
as.data.frame() %>%
rownames_to_column("Feature") %>%
arrange(desc(MeanDecreaseGini)) %>%
top_n(NFeats[[x]], MeanDecreaseGini) %>%
mutate(Type=x)
}) %>%
do.call(bind_rows, .) %>%
select(Type, Feature, everything())
Will now annotate these top features.
Annotations<-list()
Annotations$KO<-read_tsv("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/PICRUSt/mgm4535747.3_mgm4535747.3.picrust", skip=2, col_names=F) %>% mutate(Type="KO") %>% select(Type, Feature=X1, Annotation=X3)
Annotations$OTU<-taxonomy %>% mutate(Annotation=paste0(OTU, "|", gsub(" ","", Taxonomy))) %>% mutate(Type="OTU") %>% mutate(OTU=as.character(OTU)) %>% select(Type, Feature=OTU, Annotation)
Annotations$PhILR<-lapply(subset(Features, Type=="PhILR")$Feature, function(x) tibble(Type="PhILR", Feature=x, Annotation=name.balance(tr=tree, tax=taxonomy, coord=x))) %>% do.call(bind_rows, .)
Features<-Features %>% left_join(do.call(bind_rows, Annotations))
rm(Annotations)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4573748 244.3 7686095 410.5 NA 7686095 410.5
## Vcells 49133471 374.9 157258044 1199.8 16384 169784045 1295.4
Now will add an estimate of differential abundance between them which will be the log fold change (mean HFD - mean LFD).
FCs<-
rbind(RFdata$KO, RFdata$OTU, RFdata$PhILR)[Features$Feature,] %>%
as.data.frame() %>%
rownames_to_column("Feature") %>%
as_tibble() %>%
gather(-Feature, key=SampleID, value=Abundance) %>%
left_join(metadata %>% select(SampleID, Task, Diet_Classification)) %>%
filter(Task=="Murine Training Set") %>%
group_by(Feature, Diet_Classification) %>%
summarize(mean=mean(Abundance)) %>%
spread(key=Diet_Classification, value=mean) %>%
mutate(logFC=HFD-LFD) %>%
ungroup() %>%
select(Feature, HFD_mean=HFD, LFD_mean=LFD, logFC)
Features<-Features %>% as_tibble() %>% left_join(FCs)
And finally will do a spearman correlation between abundance and fat as a continuous variable.
Cors<-
rbind(RFdata$KO, RFdata$OTU, RFdata$PhILR)[Features$Feature,] %>%
as.data.frame() %>%
rownames_to_column("Feature") %>%
as_tibble() %>%
gather(-Feature, key="SampleID", value="Abundance") %>%
left_join(read_tsv("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/CollatedData/metadata_filtered.tsv") %>% select(SampleID=DB_Sample, P_Fat)) %>%
filter(!is.na(P_Fat)) %>%
group_by(Feature) %>%
do(
broom::tidy(cor.test(.$P_Fat,.$Abundance, method="spearman"))
) %>%
select(Feature, rho=estimate, Pvalue=p.value) %>%
ungroup() %>%
mutate(FDR=p.adjust(Pvalue, method="BH"))
Features<-Features %>% left_join(Cors) %>% as_tibble() %>% select(Type, Feature, HFD_mean, LFD_mean, logFC, MeanDecreaseAccuracy, MeanDecreaseGini, rho, Pvalue, FDR, Annotation)
write_tsv(Features,"figures/MostImportantFeatures.txt")
Nice.Table(Features)
Features %>%
filter(Type=="OTU") %>%
top_n(10, MeanDecreaseGini) %>%
mutate(Annotation=gsub("__"," ", Annotation)) %>%
mutate(Annotation=factor(Annotation, levels=Annotation)) %>%
ggplot(aes(x=Annotation, y=MeanDecreaseGini, fill=logFC)) +
geom_bar(stat="identity", color="black") +
theme_MicrobeR() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
scale_fill_gradient2(low=LFDcolor, high=HFDcolor) +
theme(legend.position=c(0.9,0.7))
ggsave("figures/MostImpOTUs.pdf", device="pdf", height=8, width=6, useDingbats=F)
Here the importance of the node is being denoted by its shading while the OTUs are also being superimposed with their fold change as calculated above.
ptree<-drop.tip(tree, tree$tip.label[!tree$tip.label %in% Features$Feature]) #ptree<-tree #reduced for simplified viewing
plottree<-ggtree(ptree, layout = "fan", color="black") #
plottree<-plottree %<+% (Features %>% select(Feature, everything()))
# annotate a set of nodes of interest
interest<-c("p__Bacteroidetes","p__Firmicutes","g__Lactococcus")
for( i in interest){
tips<-taxonomy %>% filter(grepl(i, Taxonomy)) %>% filter(OTU %in% ptree$tip.label) %>% pull(OTU)
nd<-getMRCA(phy=ptree, tip=as.character(tips))
plottree<-
plottree + geom_hilight(
node=nd,
fill="grey50",
alpha=0.4
) +
geom_cladelabel(node=nd, label=i, offset = 0.03)
}
plottree +
geom_tippoint(aes(size=MeanDecreaseGini, fill=logFC), color="black", shape=21, alpha=0.9) +
scale_fill_gradient2(low=LFDcolor, high=HFDcolor) +
geom_nodepoint(aes(size=MeanDecreaseGini)) +
theme(legend.position="right") +
#geom_nodelab() +
geom_cladelabel(node=name.to.nn(ptree, "n12115"), label="x") +
geom_hilight(node=name.to.nn(ptree, "n12115"),fill="grey50",alpha=0.4) +
geom_cladelabel(node=name.to.nn(ptree, "n8187"), label="y") +
geom_hilight(node=name.to.nn(ptree, "n8187"),fill="grey50",alpha=0.4) +
geom_cladelabel(node=name.to.nn(ptree, "n7572"), label="z") +
geom_hilight(node=name.to.nn(ptree, "n7572"),fill="grey50",alpha=0.4)
ggsave("figures/rftree.pdf", device="pdf", height=9, width=11, useDingbats=F)
In this plot we see what appears to be two highly responsive clades marked x and y above. Will now try to better establish what exactly the taxonomy of these clades is as the Green Genes taxonomy is not informative. Taxonomy of the OTU sequences will be reclassified against a more recent version of the SILVA database using the taxonomic assignment tool of Dada2 which will look for perfect matches to assign species level taxonomy.
Clades<-
bind_rows(
tibble(Clade="x", OTU=extract.clade(ptree, "n12115")$tip.label),
tibble(Clade="y", OTU=extract.clade(ptree, "n8187")$tip.label),
tibble(Clade="z", OTU=extract.clade(ptree, "n7572")$tip.label)
)
gg<-readDNAStringSet("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/dbs/gg_13_8_otus/rep_set/97_otus.fasta")
gg<-gg[names(gg) %in% Clades$OTU]
gg<-tibble(OTU=names(gg), Sequence=as.character(gg))
Clades<-Clades %>% left_join(gg) %>% left_join(taxonomy %>% mutate(OTU=as.character(OTU)) %>% select(OTU, GG_taxonomy=Taxonomy))
silva<-dada2::assignTaxonomy(seqs = Clades$Sequence, refFasta = "/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/dbs/SILVA_132/silva_nr_v132_train_set.fa.gz")
silva<-silva %>% as.data.frame() %>% rownames_to_column("Sequence") %>% mutate(Silva_taxonomy=paste(Kingdom, Phylum, Class, Order, Family, Genus, sep="; ")) %>% select(Sequence, Silva_taxonomy)
Clades<-Clades %>% left_join(silva) %>% select(Clade, OTU, GG_taxonomy, Silva_taxonomy, Sequence)
rm(gg, silva)
Nice.Table(Clades)
write_tsv(Clades, "figures/Clades.txt")
enrich<-clusterProfiler::enrichKEGG(Features %>% filter(Type=="KO") %>% pull(Feature), organism="ko", pAdjustMethod="BH")
enrich<-as.data.frame(enrich) %>%
as.tibble() %>%
filter(p.adjust<0.1)
Nice.Table(enrich)
Now we can plot the network.
d<-as.data.frame(enrich)
membership<-strsplit(d$geneID, split="/")
names(membership)<-d$ID
membership<-plyr::ldply(membership, data.frame)
colnames(membership)<-c("Pathway","Feature")
membership<-membership %>% left_join(Features) %>% left_join(d, by=c("Pathway"="ID"))
#http://kateto.net/network-visualization
#For each node, want: ID \t FDR \t log2FoldChange
nodes<-Features %>% filter(Type=="KO") %>% select(Feature, MeanDecreaseGini, logFC)
nodes$Type<-"Feature"
nodes<-nodes[nodes$Feature %in% unlist(strsplit(d$geneID, split="/")),]
dd<-d
dd$log2FoldChange<-NA
dd<-dd[,c("Description","log2FoldChange","p.adjust")]
dd$Type<-"Pathway"
colnames(dd)<-colnames(nodes)
nodes<-rbind(nodes, dd)
nodes<-nodes
links<-membership[,c("Description","Feature")]
net <- igraph::graph_from_data_frame(d=links, vertices=nodes, directed=F)
#plot_network(net) #using phyloseq's code
g<-net
edgeDF <- data.frame(igraph::get.edgelist(g))
edgeDF$id <- 1:length(edgeDF[, 1])
vertDF <- igraph::layout.auto(g)
colnames(vertDF) <- c("x", "y")
vertDF <- data.frame(value = igraph::get.vertex.attribute(g, "name"),
vertDF)
#extraData = nodes[as.character(vertDF$value), drop = FALSE]
#vertDF <- data.frame(vertDF, extraData)
vertDF<-vertDF %>% left_join(nodes, by=c("value"="Feature"))
graphDF <- merge(reshape2::melt(edgeDF, id = "id"), vertDF,
by = "value")
point_size="MeanDecreaseGini"
color="log2FC"
# vertDF$FDR.Colonization[vertDF$Type=="Pathway"]=0.001
#vertDF$log2FoldChange[vertDF$log2FoldChange>6]=6
#vertDF$log2FoldChange[vertDF$log2FoldChange<(-6)]=-6
vertDF<-vertDF %>% mutate(MeanDecreaseGini=if_else(Type=="Pathway", 12.5, MeanDecreaseGini))
Knet<-
ggplot() +
geom_line(data=graphDF, aes(x=x, y=y, group = id), color="grey", alpha=0.4) +
geom_point(data=vertDF, aes(x,y, fill = logFC, size = log2(MeanDecreaseGini), shape=Type), color="grey80") +
geom_text(data=subset(vertDF, Type=="Pathway"), aes(x,y, label = value), size = 2, hjust = 0, vjust=1) +
coord_equal() +
scale_fill_gradientn(colors=c("cornflowerblue", "white", "indianred", "darkred")) +
theme_minimal() +
theme(axis.text.x=element_blank(), axis.text.y=element_blank(),axis.ticks=element_blank()) +
theme(axis.title.x=element_blank(),axis.title.y=element_blank()) +
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank()) +
scale_shape_manual(values=c(21, 23))
Knet
ggsave("figures/RFpicrust_wolabs.pdf", Knet, device="pdf", width=8, height=8)
Knet + geom_text(data=subset(vertDF, Type=="Feature"), aes(x,y, label = value), size = 2, hjust = 0, vjust=1)
ggsave("figures/RFpicrust_wlabs.pdf", device="pdf", width=8, height=8)
models<-lapply(RFdata, function(x){
randomForest(
x=t(x[rownames(x) %in% Features$Feature,subset(metadata, Task=="Murine Training Set")$SampleID]),
y=subset(metadata, Task=="Murine Training Set")$Diet_Classification)
})
models
## $KO
##
## Call:
## randomForest(x = t(x[rownames(x) %in% Features$Feature, subset(metadata, Task == "Murine Training Set")$SampleID]), y = subset(metadata, Task == "Murine Training Set")$Diet_Classification)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 14
##
## OOB estimate of error rate: 8.26%
## Confusion matrix:
## HFD LFD class.error
## HFD 256 19 0.06909091
## LFD 28 266 0.09523810
##
## $OTU
##
## Call:
## randomForest(x = t(x[rownames(x) %in% Features$Feature, subset(metadata, Task == "Murine Training Set")$SampleID]), y = subset(metadata, Task == "Murine Training Set")$Diet_Classification)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 15
##
## OOB estimate of error rate: 8.96%
## Confusion matrix:
## HFD LFD class.error
## HFD 251 24 0.08727273
## LFD 27 267 0.09183673
##
## $PhILR
##
## Call:
## randomForest(x = t(x[rownames(x) %in% Features$Feature, subset(metadata, Task == "Murine Training Set")$SampleID]), y = subset(metadata, Task == "Murine Training Set")$Diet_Classification)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 21
##
## OOB estimate of error rate: 7.21%
## Confusion matrix:
## HFD LFD class.error
## HFD 259 16 0.05818182
## LFD 25 269 0.08503401
In addition, will create a simple logistic model based solely on the F/B ratio.
FB<-Summarize.Taxa(
readRDS("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/Markdown/Diversity/RDS/OTUs.RDS")$Filtered,
readRDS("/Volumes/turnbaughlab/qb3share/jbisanz/HFD_metastudy/Markdown/Diversity/RDS/taxonomy.RDS")
)$Phylum + 0.5
RFdata$FBratio<-
FB %>%
as.data.frame() %>%
Make.Percent() %>%
as.data.frame() %>%
rownames_to_column("Phylum") %>%
as.tibble() %>%
filter(grepl("Firmicutes|Bacteroidetes", Phylum)) %>%
gather(-Phylum, key=Sample, value=abundance) %>%
spread(key=Phylum, value=abundance) %>%
mutate(log2FB=log2(`k__Bacteria;p__Firmicutes`/`k__Bacteria;p__Bacteroidetes`)) %>%
select(SampleID=Sample, log2FB) %>%
left_join(metadata)
models$FBratio<-glm(Diet_Classification~log2FB, data=subset(RFdata$FBratio, Task=="Murine Training Set"), family=binomial(link="logit"))
models$FBratio
##
## Call: glm(formula = Diet_Classification ~ log2FB, family = binomial(link = "logit"),
## data = subset(RFdata$FBratio, Task == "Murine Training Set"))
##
## Coefficients:
## (Intercept) log2FB
## 0.6135 -0.5487
##
## Degrees of Freedom: 568 Total (i.e. Null); 567 Residual
## Null Deviance: 788.2
## Residual Deviance: 648 AIC: 652
summary(models$FBratio)
##
## Call:
## glm(formula = Diet_Classification ~ log2FB, family = binomial(link = "logit"),
## data = subset(RFdata$FBratio, Task == "Murine Training Set"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8721 -1.0163 0.4025 0.9634 2.2939
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.61346 0.10828 5.665 1.47e-08 ***
## log2FB -0.54871 0.05741 -9.558 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 788.17 on 568 degrees of freedom
## Residual deviance: 648.04 on 567 degrees of freedom
## AIC: 652.04
##
## Number of Fisher Scoring iterations: 4
saveRDS(models, "RDS/models.RDS")
Predictions<-list()
AUCs<-list()
for(Tasks in unique(metadata$Task)){
Predictions[[Tasks]]<-list()
AUCs[[Tasks]]<-list()
for(Types in c("PhILR","KO","OTU","FBratio")){
message(Tasks,"->",Types)
if(Types!="FBratio"){
tm<-predict(models[[Types]], newdata=t(RFdata[[Types]][,subset(metadata, Task==Tasks)$SampleID]), type="prob")[,2] %>%
prediction(., subset(metadata, Task==Tasks)$Diet_Classification) %>%
performance(., "tpr","fpr")
Predictions[[Tasks]][[Types]]<-
tibble(Task=Tasks, Type=Types, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
auc<-predict(models[[Types]], newdata=t(RFdata[[Types]][,subset(metadata, Task==Tasks)$SampleID]), type="prob")[,2] %>%
prediction(., subset(metadata, Task==Tasks)$Diet_Classification) %>%
performance(., "auc")
AUCs[[Tasks]][[Types]]<-
tibble(
Task=Tasks,
Type=Types,
AUC=auc@y.values[[1]]
)
} else {
#for FBratio only
tm<-predict(object=models[[Types]], newdata=subset(RFdata[[Types]], Task==Tasks ), type="response") %>% # must select the appropriate samples
prediction(., subset(RFdata[[Types]], Task==Tasks )$Diet_Classification) %>%
performance(., "tpr","fpr")
Predictions[[Tasks]][[Types]]<-
tibble(Task=Tasks, Type=Types, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
auc<-predict(object=models[[Types]], newdata=subset(RFdata[[Types]], Task==Tasks ), type="response") %>% # must select the appropriate samples
prediction(., subset(RFdata[[Types]], Task==Tasks )$Diet_Classification) %>%
performance(., "auc")
AUCs[[Tasks]][[Types]]<-
tibble(
Task=Tasks,
Type=Types,
AUC=auc@y.values[[1]]
)
}
}
}
Predictions<-lapply(Predictions, function(x) do.call(bind_rows, x)) %>% do.call(bind_rows, .)
AUCs<-lapply(AUCs, function(x) do.call(bind_rows, x)) %>% do.call(bind_rows, .)
Nice.Table(AUCs %>% spread(key=Type, value=AUC))
write_tsv(AUCs,"figures/AUCs.txt")
Predictions %>%
mutate(Type=factor(Type, levels=c("FBratio", "OTU","PhILR","KO"))) %>%
mutate(Task=factor(Task, levels=c("Murine Training Set","Murine Test Set","External Murine Sample","Humanized Mice","Human"))) %>%
ggplot(aes(x=FPR, y=TPR, color=Task, group=Task)) +
geom_abline(color="grey", linetype="dashed") +
geom_line() +
xlab("false positive") +
ylab("true positive") +
theme_MicrobeR() +
facet_grid(~Type) +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
coord_cartesian(expand=F)
ggsave("figures/ROCs.pdf", device="pdf", height=2, width=6, useDingbats=F)
AUCs %>%
mutate(Type=factor(Type, levels=c("OTU","PhILR","FBratio","KO"))) %>%
mutate(Task=factor(Task, levels=c("Murine Training Set","Murine Test Set","External Murine Sample","Humanized Mice","Human"))) %>%
ggplot(aes(x=Task, y=AUC, group=Type, fill=Type)) +
geom_bar(stat="identity", position=position_dodge(), color="black") +
theme_MicrobeR() +
coord_cartesian(ylim=c(0.4,1.02), expand=0) +
scale_fill_viridis_d()
ggsave("figures/Auc_plots.pdf", device="pdf", height=2, width=6, useDingbats=F)
#Now Task is really coding for Study
Predictions<-list()
AUCs<-list()
for(Tasks in unique(metadata$StudyID)[!unique(metadata$StudyID) %in% c("Anhe 2015","Howe 2016","Hu 2015","Ruan 2016","Lu 2017")]){ #must remove small studies with single samples in the test sets or not enough samples for meaningful number
Predictions[[Tasks]]<-list()
AUCs[[Tasks]]<-list()
for(Types in c("PhILR","KO","OTU","FBratio")){
message(Tasks,"->",Types)
if(Types!="FBratio"){
tm<-predict(models[[Types]], newdata=t(RFdata[[Types]][,subset(metadata, StudyID==Tasks & Task!="Murine Training Set")$SampleID]), type="prob")[,2] %>%
prediction(., subset(metadata, StudyID==Tasks & Task!="Murine Training Set" )$Diet_Classification) %>%
performance(., "tpr","fpr")
Predictions[[Tasks]][[Types]]<-
tibble(Task=Tasks, Type=Types, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
auc<-predict(models[[Types]], newdata=t(RFdata[[Types]][,subset(metadata, StudyID==Tasks & Task!="Murine Training Set")$SampleID]), type="prob")[,2] %>%
prediction(., subset(metadata, StudyID==Tasks & Task!="Murine Training Set")$Diet_Classification) %>%
performance(., "auc")
AUCs[[Tasks]][[Types]]<-
tibble(
Task=Tasks,
Type=Types,
AUC=auc@y.values[[1]]
)
} else {
#for FBratio only
tm<-predict(object=models[[Types]], newdata=subset(RFdata[[Types]], StudyID==Tasks & Task!="Murine Training Set"), type="response") %>% # must select the appropriate samples
prediction(., subset(RFdata[[Types]], StudyID==Tasks & Task!="Murine Training Set")$Diet_Classification) %>%
performance(., "tpr","fpr")
Predictions[[Tasks]][[Types]]<-
tibble(Task=Tasks, Type=Types, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
auc<-predict(object=models[[Types]], newdata=subset(RFdata[[Types]], StudyID==Tasks & Task!="Murine Training Set"), type="response") %>% # must select the appropriate samples
prediction(., subset(RFdata[[Types]], StudyID==Tasks & Task!="Murine Training Set")$Diet_Classification) %>%
performance(., "auc")
AUCs[[Tasks]][[Types]]<-
tibble(
Task=Tasks,
Type=Types,
AUC=auc@y.values[[1]]
)
}
}
}
Predictions<-lapply(Predictions, function(x) do.call(bind_rows, x)) %>% do.call(bind_rows, .)
AUCs<-lapply(AUCs, function(x) do.call(bind_rows, x)) %>% do.call(bind_rows, .)
ord<-AUCs %>% spread(key=Type, value=AUC) %>% as.data.frame() %>% column_to_rownames("Task") %>% dist(., "euclidian") %>% hclust()
AUCs %>%
mutate(Type=factor(Type, levels=c("FBratio","OTU","PhILR","KO"))) %>%
mutate(Task=factor(Task, levels=ord$labels[ord$order])) %>%
mutate(AUC=if_else(AUC<0.5, 0.5, AUC)) %>%
ggplot(aes(y=Task, x=Type, fill=AUC)) +
geom_tile() +
scale_fill_viridis_c() +
theme_MicrobeR() +
coord_cartesian(expand=0)
ggsave("figures/Auc_heatmap.pdf", device="pdf", height=5, width=3, useDingbats=F)
write_tsv(AUCs,"figures/AUCs_bystudy.txt")
For the purposes of this analysis will skip the feature selection and go straight to model and prediction.
models<-lapply(RFdata[1:3], function(x){
randomForest(
x=t(x[,subset(metadata, Task=="Humanized Mice")$SampleID]),
y=subset(metadata, Task=="Humanized Mice")$Diet_Classification)
})
models$FBratio<-glm(Diet_Classification~log2FB, data=subset(RFdata$FBratio, Task=="Humanized Mice"), family=binomial(link="logit"))
summary(models$FBratio)
##
## Call:
## glm(formula = Diet_Classification ~ log2FB, family = binomial(link = "logit"),
## data = subset(RFdata$FBratio, Task == "Humanized Mice"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.85397 -0.31720 0.06231 0.32639 1.87615
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.800 1.644 2.92 0.0035 **
## log2FB -4.741 1.580 -3.00 0.0027 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 63.683 on 45 degrees of freedom
## Residual deviance: 25.728 on 44 degrees of freedom
## AIC: 29.728
##
## Number of Fisher Scoring iterations: 7
models
## $KO
##
## Call:
## randomForest(x = t(x[, subset(metadata, Task == "Humanized Mice")$SampleID]), y = subset(metadata, Task == "Humanized Mice")$Diet_Classification)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 83
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## HFD LFD class.error
## HFD 22 0 0
## LFD 0 24 0
##
## $OTU
##
## Call:
## randomForest(x = t(x[, subset(metadata, Task == "Humanized Mice")$SampleID]), y = subset(metadata, Task == "Humanized Mice")$Diet_Classification)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 120
##
## OOB estimate of error rate: 4.35%
## Confusion matrix:
## HFD LFD class.error
## HFD 21 1 0.04545455
## LFD 1 23 0.04166667
##
## $PhILR
##
## Call:
## randomForest(x = t(x[, subset(metadata, Task == "Humanized Mice")$SampleID]), y = subset(metadata, Task == "Humanized Mice")$Diet_Classification)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 120
##
## OOB estimate of error rate: 2.17%
## Confusion matrix:
## HFD LFD class.error
## HFD 22 0 0.00000000
## LFD 1 23 0.04166667
##
## $FBratio
##
## Call: glm(formula = Diet_Classification ~ log2FB, family = binomial(link = "logit"),
## data = subset(RFdata$FBratio, Task == "Humanized Mice"))
##
## Coefficients:
## (Intercept) log2FB
## 4.800 -4.741
##
## Degrees of Freedom: 45 Total (i.e. Null); 44 Residual
## Null Deviance: 63.68
## Residual Deviance: 25.73 AIC: 29.73
Predictions<-list()
AUCs<-list()
for(Tasks in unique(metadata$Task)){
Predictions[[Tasks]]<-list()
AUCs[[Tasks]]<-list()
for(Types in c("PhILR","KO","OTU","FBratio")){
message(Tasks,"->",Types)
if(Types!="FBratio"){
tm<-predict(models[[Types]], newdata=t(RFdata[[Types]][,subset(metadata, Task==Tasks)$SampleID]), type="prob")[,2] %>%
prediction(., subset(metadata, Task==Tasks)$Diet_Classification) %>%
performance(., "tpr","fpr")
Predictions[[Tasks]][[Types]]<-
tibble(Task=Tasks, Type=Types, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
auc<-predict(models[[Types]], newdata=t(RFdata[[Types]][,subset(metadata, Task==Tasks)$SampleID]), type="prob")[,2] %>%
prediction(., subset(metadata, Task==Tasks)$Diet_Classification) %>%
performance(., "auc")
AUCs[[Tasks]][[Types]]<-
tibble(
Task=Tasks,
Type=Types,
AUC=auc@y.values[[1]]
)
} else {
#for FBratio only
tm<-predict(object=models[[Types]], newdata=subset(RFdata[[Types]], Task==Tasks ), type="response") %>% # must select the appropriate samples
prediction(., subset(RFdata[[Types]], Task==Tasks )$Diet_Classification) %>%
performance(., "tpr","fpr")
Predictions[[Tasks]][[Types]]<-
tibble(Task=Tasks, Type=Types, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
auc<-predict(object=models[[Types]], newdata=subset(RFdata[[Types]], Task==Tasks ), type="response") %>% # must select the appropriate samples
prediction(., subset(RFdata[[Types]], Task==Tasks )$Diet_Classification) %>%
performance(., "auc")
AUCs[[Tasks]][[Types]]<-
tibble(
Task=Tasks,
Type=Types,
AUC=auc@y.values[[1]]
)
}
}
}
Predictions<-lapply(Predictions, function(x) do.call(bind_rows, x)) %>% do.call(bind_rows, .)
AUCs<-lapply(AUCs, function(x) do.call(bind_rows, x)) %>% do.call(bind_rows, .)
Nice.Table(AUCs %>% spread(key=Type, value=AUC))
write_tsv(AUCs,"figures/AUCs_hum.txt")
Predictions %>%
mutate(Type=factor(Type, levels=c("OTU","PhILR","FBratio","KO"))) %>%
mutate(Task=factor(Task, levels=c("Murine Training Set","Murine Test Set","External Murine Sample","Humanized Mice","Human"))) %>%
ggplot(aes(x=FPR, y=TPR, color=Task, group=Task)) +
geom_abline(color="grey", linetype="dashed") +
geom_line() +
xlab("false positive") +
ylab("true positive") +
theme_MicrobeR() +
facet_grid(~Type) +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggsave("figures/ROCs_hum.pdf", device="pdf", height=2, width=6, useDingbats=F)
AUCs %>%
mutate(Type=factor(Type, levels=c("FBratio","OTU","PhILR","KO"))) %>%
mutate(Task=factor(Task, levels=c("Murine Training Set","Murine Test Set","External Murine Sample","Humanized Mice","Human"))) %>%
ggplot(aes(x=Task, y=AUC, group=Type, fill=Type)) +
geom_bar(stat="identity", position=position_dodge(), color="black") +
theme_MicrobeR() +
coord_cartesian(ylim=c(0.4,1.02), expand=0) +
scale_fill_viridis_d()
ggsave("figures/Auc_plots_hum.pdf", device="pdf", height=2, width=6, useDingbats=F)
save.image(paste0("RDS/Session",format(Sys.time(),"%d%b%Y_%H%M%S"),".Rdata"))
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 7874757 420.6 14143625 755.4 NA 14143625 755.4
## Vcells 55653043 424.6 190920964 1456.7 16384 190920024 1456.7