-
-
Save kviljoen/97d36c689c5c9b9c39939c7a100720b9 to your computer and use it in GitHub Desktop.
library(phyloseq) | |
library(pheatmap) | |
library(vegan) | |
library(ggplot2) | |
library(corrplot)#for cor.mtest | |
library(psych)#corr.test | |
library(matrixStats)#rowSds | |
library(metagenomeSeq)#differential abundance testing | |
library(randomForest) | |
library(dplyr) | |
library(ROCR) | |
library(caret) | |
library(gridExtra)#for grid.arrange() | |
#DEFINE CUSTOM COLOR PALETTE WHERE COLORS ARE EASIER TO DISTINGUISH FROM ONE ANOTHER | |
myPalette <- c('#89C5DA', "#DA5724", "#74D944", "#CE50CA", "#3F4921", "#C0717C", "#CBD588", "#5F7FC7", "#673770", "#D3D93E", "#38333E", "#508578", "#D7C1B1", "#689030", "#AD6F3B", "#CD9BCD", "#D14285", "#6DDE88", "#652926", "#7FDCC0", "#C84248", "#8569D5", "#5E738F", "#D1A33D", "#8A7C64", "#599861") | |
#??????????????????????????????????????????????????????????? | |
#SUMMARY OF FUNCTIONS AVAILABLE: | |
#1. make_metagenomeSeq() - converts phyloseq to metagenomeSeq object | |
#2. tax.lab() - GET LOWEST LEVEL OF TAXONOMIC ANNOTATION AVAILABLE TO SUBSTITUTE OTUS FOR PLOTTING PURPOSES. | |
#3. heatmap.k() - Generic heatmap function for phyloseq objects using package NMF | |
#4. bar.plots() - pretty generic barplot function built on phyloseq's plot_bar() | |
#5. tax_glom.kv() - GET LOWEST LEVEL OF TAXONOMIC ANNOTATION AVAILABLE AND MERGE COUNTS AT THIS LEVEL | |
#6. super.fitZig.kv() - FUNCTION BUILT AROUND METAGENOMESEQ'S FITZIG() AND MRFULLTABLE() FUNCTIONS INCLUDING CUSTOM FILTERING AND HEATMAP OF SIGNIFICANT RESULTS (currently only setup for 2-class comparisons) | |
#7. corr.k(): FUNCTION TO PERFORM PAIRWISE CORRELATIONS, TEST FOR SIGNIFICANCE AND PLOT CORRELLOGRAM | |
#8. MC.summary(): take the output from PICRUSt's metagenome_contributions.py, together with taxonomic annotation for the OTUs included in | |
#9. #RF.k(): performs random forests analysis on the otu table of a supplied phyloseq object. | |
#NB: this function is only setup for two-class categorical response variables NOT regression on continuous response variables | |
#??????????????????????????????????????????????????????????? | |
#********************************** | |
#make_metagenomeSeq: THIS METAGENOME SEQ FUNCTION WAS COPIED AND MODIFIED FROM CODE USED FOR THE MCMURDIE 2014 PAPER - http://joey711.github.io/waste-not-supplemental/simulation-differential-abundance/simulation-differential-abundance-server.html | |
#********************************** | |
# Function to convert from phyloseq object to metagenomeSeq object | |
make_metagenomeSeq = function(physeq) { | |
require("metagenomeSeq") | |
require("phyloseq") | |
# Enforce orientation | |
if (!taxa_are_rows(physeq)) { | |
physeq <- t(physeq) | |
} | |
OTU = as(otu_table(physeq), "matrix") | |
# Convert sample_data to AnnotatedDataFrame | |
ADF = AnnotatedDataFrame(data.frame(sample_data(physeq))) | |
# define dummy 'feature' data for OTUs, using their name Helps with | |
# extraction and relating to taxonomy later on. | |
TDF = AnnotatedDataFrame(data.frame(OTUname = taxa_names(physeq), row.names = taxa_names(physeq))) | |
# Create the metagenomeSeq object | |
MGS = newMRexperiment(counts = OTU, phenoData = ADF, featureData = TDF) | |
# Trigger metagenomeSeq to calculate its Cumulative Sum scaling factor. | |
MGS = cumNorm(MGS) | |
return(MGS) | |
} | |
#********************************** | |
#tax.lab() - GET LOWEST LEVEL OF TAXONOMIC ANNOTATION AVAILABLE TO SUBSTITUTE OTUS FOR PLOTTING PURPOSES. | |
#********************************** | |
#ARGUMENTS: | |
#otus = otu table of interest (often subset) | |
#physeq = phyloseq object of interst (for tax annotation) | |
#labrow: TRUE=annotate rows with taxonomy info, FALSE=annotate rows with OTU info. | |
#merged: if OTUs have been merged, level of merge e.g. "Genus", else FALSE | |
tax.lab <- function(otus, physeq, labrow=TRUE,merged=FALSE){ | |
#First delete columns with only NAs (generated for merged tax tables e.g. at Family level will have NA Genus, Species cols) | |
temp_tax <- tax_table(physeq) | |
temp_tax <- temp_tax[, colSums(is.na(temp_tax)) != nrow(temp_tax)] | |
tax <- data.frame(temp_tax) | |
tax <- tax[c(rownames(otus)),] | |
tax[is.na(tax)] <- "" | |
if(merged==FALSE){ | |
tax$unique <- rep("",dim(tax)[1])#make unique names | |
for(i in 1:dim(tax)[1]){ | |
x = 7#column 7 = Species | |
tax$unique[i] <- ifelse(tax[i,x]==""|tax[i,x]=="NA"|is.na(tax[i,x])==TRUE, | |
ifelse(tax[i,x-1]==""|tax[i,x-1]=="NA"|is.na(tax[i,x-1])==TRUE, | |
ifelse(tax[i,x-2]==""|tax[i,x-2]=="NA"|is.na(tax[i,x-2])==TRUE, | |
ifelse(tax[i,x-3]==""|tax[i,x-3]=="NA"|is.na(tax[i,x-3])==TRUE,#phylum | |
ifelse(tax[i,x-4]==""|tax[i,x-4]=="NA"|is.na(tax[i,x-4])==TRUE, as.character(tax[i,x-5]), | |
as.character(tax[i,x-4])),#class | |
as.character(tax[i,x-3])),#order | |
as.character(tax[i,x-2])),#family | |
as.character(tax[i,x-1])),#genus | |
as.character(tax[i,x]))#species | |
} | |
#Custom cleanup - species: | |
#For OTUs with species-level annotation: get the first character of the Genus and append to species. | |
#first remove all "[]"brackets | |
tax$Genus = sub("\\[","",tax$Genus) | |
tax$Genus = sub("\\]","",tax$Genus) | |
for(i in 1:dim(tax)[1]){ | |
if(is.na(tax$Species[i])==FALSE & tax$Species[i]!="" & tax$Species[i]!="NA"){#get OTUs with species-level annotation | |
get = substr(tax$Genus[i],1,1) | |
tax$unique[i] = sub(tax$unique[i],paste0(get,".",tax$unique[i]),tax$unique[i]) | |
} | |
} | |
tax$unique <- make.unique(tax$unique) | |
if(labrow==TRUE){ | |
labs = tax$unique | |
}else if(labrow==FALSE){ | |
labs = rownames(otus) | |
} | |
}else if(length(merged)>0){ | |
if(labrow==TRUE){ | |
labs=tax[,merged] | |
}else if(labrow==FALSE){ | |
labs = rownames(otus) | |
} | |
} | |
return(labs) | |
} | |
#********************************** | |
#heatmap.k() Generic heatmap function for phyloseq | |
#objects using package pheatmap | |
#********************************** | |
#ARGUMENTS: | |
#physeq = phyloseq object of interst (for tax annotation) - only applicable if you're plotting OTUs. | |
#- | |
#plot.otus = are you plotting otus, true or false, default = TRUE | |
#- | |
#data.subset - either a dataframe with rownames = the otus you want to plot (typically the significant results table from metagenomeseq) | |
#OR a dataframe with some other data you want to plot e.g. cytokines, flow cyt, gene expression | |
#- | |
#subset.samples = names of samples to include in heatmap, default = NULL, i.e. use all samples | |
#annot.cols = columns of interest in sample_data(physeq) for annotation track | |
#colours = typical NMF color specification for columns - see NMF reference manual | |
#order.by = column number by which to order columns (variable of interest), default = NULL (i.e. perform hierarchical clustering) | |
#main = Title of plot (analysis details) | |
#subt = subtitle of plot (threshold info) | |
#filename = filename | |
#Colv = should columns be clustered, default = no | |
#labrow: TRUE=annotate rows with taxonomy info, FALSE=annotate rows with OTU info. - only applicable if you're plotting OTUs | |
#if OTUs have been merged, TRUE, else FALSE - only applicable if you're plotting OTUs | |
#merged = if the data has been merged at a given taxonomic level - supply column name e.g. "Genus" | |
#distfun = distance function used, default 'euclidian' | |
#hclustfun = clustering function used, default 'average' | |
#scale: see pheatmap() from pheatmap package for details. Should heatmap rows/columns be scaled? | |
heatmap.k = function(physeq=NULL, plot.otus = TRUE,data.subset=NULL,subset.samples = NULL, | |
annot.cols = NULL,colours = NULL, order.by = NULL,main=NULL, subt=NULL,filename, | |
Colv = TRUE,rows.sortby=TRUE,labrow = FALSE, merged = FALSE, distfun = 'euclidean', | |
hclustfun = 'average', | |
cexCol=1, cexRow=1, scale = "none"){ | |
if(plot.otus==TRUE){ | |
#If plotting OTUs - get OTUs | |
otus <- otu_table(physeq) | |
if(length(subset.samples)==0){ | |
print("including all samples")#do nothing | |
}else if(length(subset.samples) >0){#if there was a subset specified: | |
physeq <- prune_samples(sample_names(physeq)%in% subset.samples,physeq) | |
otus <- otus[,colnames(otus) %in% subset.samples] | |
} | |
if(length(data.subset)==0){ | |
print("including all otus")#do nothing | |
}else if(length(data.subset) >0){#if an otus subset was specified | |
otus <- otus[rownames(otus) %in% rownames(data.subset),] | |
} | |
#log2 transform otus for better heatmap visualisation colour scale | |
zs <- which(otus==0)#find zero entries | |
if(length(zs)>0){#if there are zero values, transform | |
otus[zs] <- 1 #set zs to 1 pseudocount | |
otus <- log2(otus) | |
} else{otus <- log2(otus)} | |
plot <- otus#data to be plotted | |
#-- | |
}else if(plot.otus==FALSE){ | |
plot = data.subset#this is for non-otu data | |
if(length(colnames(plot) %in% sample_names(physeq))>0){ | |
#make sure the table is in the correct orientation | |
}else{plot = t(plot)}#transform so that sample names are column names not rownames | |
#make sure to subset the data appropriately to include only samples in data.subset | |
sample_data(physeq) <- sample_data(physeq)[rownames(sample_data(physeq)) %in% colnames(plot),] | |
} | |
#get annotation | |
pheno <- sample_data(physeq) | |
a.track <- pheno[,annot.cols]#select columns of interst | |
if(length(order.by)==0){ | |
#do nothing | |
}else {a.track <- a.track[order(a.track[,order.by][[1]], decreasing = TRUE),]}#sort by variable of interest | |
#make sure the samples used in the data to plot match that of the annotation track | |
plot = plot[,rownames(a.track)] | |
#sort ROWS by column of interest? (e.g. if you want to sort by p-value or coefficient size) | |
if(length(rows.sortby)!=0){ | |
#this is only for results from super.fitZig.kv and makes use of the column specified in 'rows.sortby' | |
if(length(which(colnames(data.subset)==rows.sortby))==1){# | |
ids.order <- rownames(data.subset[order(abs(data.subset[,rows.sortby])),]) | |
plot <- plot[ids.order,] | |
rows.sortby = NA | |
} | |
} | |
#------------------- | |
#row tax annotation: if plotting otus | |
if(plot.otus==TRUE){ | |
labs <- tax.lab(physeq=physeq,otus=plot,labrow, merged = merged) | |
}else{labs = rownames(plot)} | |
#Write to file | |
plot <- data.frame(plot) | |
a.track <- data.frame(a.track) | |
pdf(filename) | |
pheatmap(mat=plot, main = main, | |
sub = subt, | |
scale = scale, | |
cluster_cols = Colv, | |
cluster_rows = rows.sortby, | |
annotation_col = a.track, | |
annotation_colors = colours, | |
labels_row = labs, | |
clustering_distance_cols = distfun, | |
clustering_method=hclustfun, | |
fontsize=10, | |
fontsize_col=cexCol, | |
fontsize_row=cexRow) | |
dev.off() | |
} | |
#********************************** | |
#bar.plots() - generic barplot function build on phyloseq plot_bar(): | |
#NB number of taxa that can be displayed currently limited to 26 (number defined in myPalette at start of script) | |
#********************************** | |
#This function was modified from the phyloseq plot_bar() function where ggplot2's geom_bar no longers sorted stacked bars by abundance. | |
#Don't call plot_bar_fix directly, it gets called by bar.plots() | |
#--- | |
#ARGUMENTS for bar.plots() | |
#physeq: phyloseq object, standardized counts | |
#cat: category of interest | |
#level: level at which to perform taxa merging | |
#perc: minimum fraction of samples which should be positive for a given taxa (stringent filtering (perc=0.5) used to minimize number of taxa in figure legend, and only display most abundant taxa) | |
#subset.otus = any dataframe for which the rownames will be used to subset otu.table (stats.sig) | |
#subset.samples = names of samples to include in barplot | |
#count = minimum per taxon count summed across all samples | |
#order.bars = option to order barplots - sometimes relavant if more than 2 categories (e.g. order= c("classA","classB","classC") | |
#x.axis =x axis labels, | |
#y.axis = y axis labels, | |
#filen = any other filename additions such as project name or date | |
plot_bar_fix <- function (physeq, x = "Sample", y = "Abundance", fill = level, | |
title = NULL, facet_grid = NULL) | |
{ | |
mdf = psmelt(physeq) | |
p=ggplot(mdf[order(-mdf[,y]),],aes_string(x= x, y=y, fill = fill, group = y)) + geom_bar(stat='identity') | |
p = p + theme(axis.text.x = element_text(angle = -90, hjust = 0)) | |
if (!is.null(facet_grid)) { | |
p <- p + facet_grid(facet_grid) | |
} | |
if (!is.null(title)) { | |
p <- p + ggtitle(title) | |
} | |
return(p) | |
} | |
bar.plots <- function(physeq,subset.otus = NULL, subset.samples = NULL, count,cat, | |
level, perc, order.bars = NULL,x.axis=NULL, y.axis=NULL, merge.samples = FALSE, | |
filen = "",outDir,plot_title=NULL,pdf_height=7, pdf_width=7){ | |
if(length(subset.samples)==0){ | |
}else if (length(subset.samples) >0){#if there was a subset specified: | |
physeq <- prune_samples(sample_names(physeq)%in% subset.samples,physeq) | |
} | |
if(length(subset.otus)==0){ | |
}else if(dim(subset.otus)[1] >0){#if an otus subset was specified | |
physeq <- prune_taxa(rownames(tax_table(physeq)) %in% rownames(subset.otus),physeq) | |
} | |
#MERGE OTUS AT USER-SPECIFIED LEVEL | |
merged <- tax_glom(physeq, level) | |
#filter taxa using the same filter used for diff. abundance testing | |
merged = filter_taxa(merged, function(x) sum(x > count) >= (perc*length(x)), TRUE)#filter taxa for barplot display | |
if(merge.samples==FALSE){ | |
new <- transform_sample_counts(merged, function(x) 100 * x/sum(x))#convert to % abundance | |
}else{ | |
new <- merge_samples(merged,cat, fun = "median")#Merge by category | |
new <- transform_sample_counts(new, function(x) 100 * x/sum(x))#convert to % abundance | |
sample_data(new)[,cat] <- rownames(sample_data(new)) #merge_samples also "merges the sample table, replace relevant cat column with rownames | |
} | |
p2 <- plot_bar_fix(physeq=new,x = cat, fill=level ,title = plot_title)+ coord_flip() + | |
ylab("Percentage of Sequences")+theme(axis.text=element_text(size=14), | |
axis.title=element_text(size=14),legend.title=element_text(size=16), | |
legend.text=element_text(size=14), | |
plot.title=element_text(size=14, face="bold"))+scale_x_discrete(limits=c(order.bars))+scale_fill_manual(values=myPalette)+theme_bw()+ | |
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) | |
p2 = p2+labs(x.axis=x.axis, y.axis=y.axis) | |
pdf(paste0(outDir,"/",filen,"",level,"_abundance_by_",x.axis,perc,"_",count,".pdf"),pdf_width,pdf_height) | |
par(cex.lab = 2) | |
par(cex.axis = 2) | |
grid.arrange(p2, ncol=1) | |
dev.off() | |
return(p2) | |
} | |
#********************************** | |
##tax_glom.kv - combination of phyloseq's tax_glom() and my tax.lab() function | |
#GET LOWEST LEVEL OF TAXONOMIC ANNOTATION AVAILABLE AND MERGE COUNTS AT THIS LEVEL (i.e. I have 3 otus of L.iners and another 3 of Lactobacillus (unknown species) --> | |
#e.g. merge L.iners at species level and Lactobacillus (no species annot) at genus level. | |
#modified from phyloseq function tax_glom | |
#NB: this works for when we have the standard 7 ranks | |
#colnames(tax_table(physeq)) | |
#[1] "Kingdom" "Phylum" "Class" "Order" "Family" "Genus" "Species" | |
#********************************** | |
#ARGUMENTS: | |
#otus = otu table of interest, if not specified, defaults to OTU table of the phyloseq obj. specified. | |
#physeq = phyloseq object with taxonomic annotation | |
tax_glom.kv <- function (physeq) | |
{ | |
otus = otu_table(physeq) | |
if (is.null(access(physeq, "tax_table"))) { | |
stop("The tax_glom.kv() function requires that physeq contain a taxonomyTable") | |
} | |
#check if there is a sample_data slot - function merge_phyloseq() downstream requires a sample_data slot (bug?) | |
remove.SD = FALSE | |
remove.SD2 = FALSE | |
if (is.null(access(physeq, "sam_data"))) { | |
# Dummy sample data | |
SD = sample_data(data.frame(sampnames=sample_names(physeq), sampnames2=sample_names(physeq))) | |
rownames(SD) = sample_names(physeq) | |
# Add the dummy sample data | |
sample_data(physeq) <- SD | |
remove.SD = TRUE #use later to remove dummy sample data before function return | |
}else if (dim(sample_data(physeq))[2] == 1){ | |
#If there is a sample_data slot but with only one column, create an additional dummy column - function merge_phyloseq() downstream requires | |
sample_data(physeq)[,2] = sample_names(physeq)#dummy column with sample names | |
remove.SD2 = TRUE | |
} | |
#Check for taxonomy table, fail if none | |
if (is.null(access(physeq, "tax_table"))) { | |
stop("The tax_glom.kv() function requires that physeq contain a taxonomyTable") | |
} | |
#check if there is a phylogenetic tree in the phyloseq object - it needs to be removed first otherwise the function will break trying to 'merge' the tree | |
if (!is.null(access(physeq, "phy_tree"))) { | |
print("Removing phylogenetic tree") | |
physeq@phy_tree <- c() | |
} | |
tax.k <- data.frame(tax_table(physeq))#extract tax table from phyloseq obj | |
#check otu table orientation, change if necessary so that taxa are rows | |
if (length(intersect(colnames(otus),rownames(tax.k))>0)){#OTU/ASVs should be rows in otu table, not columns | |
otus <- t(otus) | |
} | |
tax.k <- tax.k[c(rownames(otus)),]#reorder to match OTUs in specified otu table | |
levels = colnames(tax_table(physeq))#available taxonomic levels to consider | |
for(i in 1:dim(tax.k)[1]){ | |
x = length(levels)#x=7 (Species column) | |
tax.k$lowest[i] <- ifelse(tax.k[i,x]==""|tax.k[i,x]=="NA"|is.na(tax.k[i,x])==TRUE, | |
ifelse(tax.k[i,x-1]==""|tax.k[i,x-1]=="NA"|is.na(tax.k[i,x-1])==TRUE, | |
ifelse(tax.k[i,x-2]==""|tax.k[i,x-2]=="NA"|is.na(tax.k[i,x-2])==TRUE, | |
ifelse(tax.k[i,x-3]==""|tax.k[i,x-3]=="NA"|is.na(tax.k[i,x-3])==TRUE, | |
ifelse(tax.k[i,x-4]==""|tax.k[i,x-4]=="NA"|is.na(tax.k[i,x-4])==TRUE, "Phylum", | |
"Class"),#class | |
"Order"),#order | |
"Family"),#family | |
"Genus"),#genus | |
"Species")#species | |
#so now tax.k should be a vector of the lowest available level of annotation for each OTU (e.g. "Phylum", "Species, "Species", "Class"...) | |
} | |
#create empty list to store phyloseq sub objects (list of lists) | |
phy.list <- list() | |
#get the levels that are actually present in tax.k$lowest | |
l <- unique(tax.k$lowest) | |
for(i in 1:length(l)){ | |
tax.k.level <- tax.k[tax.k$lowest==l[i],]#subset tax.k by a specific taxonomic rank | |
phy.k.level <- prune_taxa(rownames(tax.k.level),physeq)#now subset the phyloseq object to contain only the otus in tax.k.level | |
phy.k.merged <- tax_glom(phy.k.level,taxrank=l[i])#now merge at levels[i] | |
phy.list[i] <- phy.k.merged | |
} | |
#now take the list of lists and merge it into one phyloseq object | |
i=1 | |
while(i < length(phy.list)){ | |
if(i==1){ | |
phy.new <- merge_phyloseq(phy.list[[i]], phy.list[[i+1]])#first round of merging to create new object named phy.new | |
}else { | |
phy.new <- merge_phyloseq(phy.new, phy.list[[i+1]])#then subsequently append to phy.new | |
} | |
i = i+1 | |
} | |
#remove dummy sample data where applicable | |
if (remove.SD==TRUE) { | |
sample_data(phy.new) <- c() | |
} | |
if (remove.SD2==TRUE) { | |
sample_data(phy.new) <- sample_data(phy.new)[,1] | |
} | |
# | |
print(paste("There are now",ntaxa(phy.new),"merged taxa")) | |
return(phy.new) | |
} | |
#********************************** | |
#super.fitZig.kv(): FUNCTION BUILT AROUND METAGENOMESEQ'S FITZIG() AND MRFULLTABLE() FUNCTIONS INCLUDING HEATMAP OF SIGNIFICANT RESULTS | |
#NB - CURRENTLY ONLY SETUP TO HANDLE TWO-CLASS CATEGORICAL COMPARISONS | |
#********************************** | |
#---------------------------------- | |
#ARGUMENTS: | |
#physeq = phyloseq object with raw counts but pruned by read count (e.g exclude samples with < 10 000 reads) | |
#factor = the factor of interest (This should be a column name in sample_data(physeq)) | |
#FileName = specify comparison made, not full file path (e.g. "M.prune_PICRUSt_groups_C_vs_A") | |
#p = adjusted p-value threshold to use when filtering significant results. | |
#default p=0.05 | |
#perc = threshold for filtering by fraction of +ve ssamples for a given OTU in either group (e.g perc = 0.3 will keep only OTUs where at least one of the two groups have ? 30% of samples +ve for that OTU) | |
#default perc = 0.5 | |
#FC = absolute coefficient threshold to use for filtering (default = 1.5) | |
#main = title of heatmap | |
#subt = subtitle for heatmap (most commonly filtering settings e.g."FDR < 0.05,|coeff| >= 1.5, >30%+ in either group" | |
#heatmap.describer = heatmap filename segment specifying clustering or not as well as type of annotation (OTU ID/taxonomy) (e.g."category_ordered_tax_annot.pdf" ) | |
#order = should heatmap columns be sorted manually? TRUE/FALSE, default=TRUE | |
#rows.sortby = would you like to sort the resulting heatmap by a column from the rsults table e.g. "adjPavalues" or "coeff" | |
#labrow = should the heatmap be annotated with taxonomic annotations for OTU ids (TRUE/FALSE; TRUE = taxonomic annotations; FALSE = OTU IDs) | |
#extra.cols = any extra columns that you would like to plot on the heatmap e.g. c("feeding","HIV_exposure") | |
#cexCol=scaling factor for heatmap column labels (usually sample names) | |
#cexRow=scaling factor for heatmap row labels (usually taxon names) | |
#merged: default FALSE; if data has first been merged supply relevant tax level column name e.g. "Genus" | |
#----------------------------------- | |
super.fitZig.kv <- function(physeq, factor, covariate=NULL,outDir,FileName,heatmap.descriptor=NULL, | |
main=NULL, subt=NULL, ordered=TRUE, rows.sortby = TRUE,p=0.05, FC = 1.5, perc=0.3, | |
merged = FALSE, extra.cols = NULL,colours=NULL, cexCol=12, cexRow=12){ | |
#------------------------------- | |
#........................... | |
#remove any samples with no data for the factor (or covariate) | |
sub.index <- which(sample_data(physeq)[,factor] !='NA'& sample_data(physeq)[,factor] !='<NA>'& sample_data(physeq)[,factor] !=''& sample_data(physeq)[,factor] !=' ') | |
l = dim(sample_data(physeq))[1] | |
if(length(covariate)!=0){ | |
sub.index.2 <- which(sample_data(physeq)[,covariate] !='NA'& sample_data(physeq)[,covariate] !='<NA>'& sample_data(physeq)[,covariate] !=''& sample_data(physeq)[,covariate] !=' ') | |
keep = intersect(sub.index, sub.index.2) | |
physeq <- prune_samples(sample_names(physeq)[keep],physeq) | |
print(paste(l-length(keep),"of",l,"samples were removed due to missing data")) | |
}else{ | |
physeq <- prune_samples(sample_names(physeq)[sub.index],physeq) | |
print(paste(l-length(sub.index),"of",l,"samples were removed due to missing data")) | |
} | |
#........................... | |
#convert phyloseq object to metagenomeSeq object | |
MGS <- make_metagenomeSeq(physeq)# | |
#........................... | |
#Check variables | |
levels = levels(pData(MGS)[,factor]) | |
if(length(covariate)==1){ | |
if(is.character(pData(MGS)[,covariate])==TRUE){#error message if covariate is not either numeric OR a factor variable | |
stop(print("the covariate",covariate,"is a character variable, please convert to factor or numeric first")) | |
} | |
} | |
if(length(levels)==2){#if we have a factor variable with two levels | |
print(paste(factor,"will be modeled as a binary categorical predictor variable")) | |
f = TRUE | |
}else if(length(levels) > 2){ | |
stop("invalid: your factor has more than two levels")#exit function if factor specified does not have exactly two levels | |
}else if(length(levels)==0){#then not a factor variable | |
if(is.numeric(pData(MGS)[,factor])==TRUE){ | |
print(paste(factor,"will be modeled as a continuous predictor variable")) | |
f=FALSE | |
}else if(is.character(pData(MGS)[,factor])==TRUE){#then need to know if data should be numeric or categorical | |
stop(paste(factor,"is a character variable, please convert to numeric or factor data first")) | |
} | |
} | |
#DATA NORMALISATION using the cumNormStat function from the metagenomeSeq package, default settings | |
par(mar=c(1,1,1,1)) | |
MGSp = cumNormStat(MGS,pFlag=TRUE,main="Trimmed data") | |
MGS = cumNorm(MGS,p=MGSp) | |
#------------------------------- | |
#------------------------------- | |
#Build model based on info specified in the variable 'factor' | |
fct=pData(MGS)[,factor] | |
if(length(covariate)!=0){ | |
cov.=pData(MGS)[,covariate] | |
mod = model.matrix(~fct+cov.)#including covariate | |
}else{mod = model.matrix(~fct)} | |
rownames(mod) <- rownames(pData(MGS)) | |
#------------------------------- | |
#fitZig | |
settings = zigControl(maxit=100,verbose=TRUE) | |
fit = fitZig(obj = MGS,mod=mod, | |
control=settings) #by default, the normalising factors for obj=MGS are included in the model | |
#------------------------------- | |
#CATEGORICAL VARIABLE | |
if(f==TRUE){ | |
stats <- MRfulltable(fit, coef=colnames(mod)[2],by=colnames(mod)[2],number = dim(fData(MGS))[1],group=2)#data reported for second column in the model | |
stats = stats[!is.na(rownames(stats)), ]# if any OTUs left out, rm those from x. Detected by NA rownames. | |
}else if(f==FALSE){ | |
#OR CONTINUOUS VARIABLE | |
stats <- MRcoefs(fit, coef=colnames(mod)[2],by=colnames(mod)[2], number = dim(fData(MGS))[1], group=2)# | |
} | |
#------------------------------- | |
#NEXT ADD TAXONOMIC DATA FOR SIGNIFICANT OTUs | |
#check for NA taxonomic columns, e.g. if merged at genus level, species column will be NAs | |
na_counts <- apply(tax_table(physeq),2,function(x) length(which(is.na(x))) == dim(tax_table(physeq))[1]) | |
if(length(which(na_counts==TRUE))>0){#If there is an NA column(s), remove these | |
na_cols <- which(na_counts==TRUE) | |
tax_table(physeq) <- tax_table(physeq)[,-c(na_cols)] | |
} | |
tax.tab <- data.frame(tax_table(physeq)) | |
tax <- tax.tab[c(rownames(stats)),] | |
stats <- cbind(stats, tax) | |
#change 'NA' column to coeff. | |
n <- which(is.na(colnames(stats))) | |
colnames(stats)[n] <- 'coeff' | |
#FILTER RESULTS: | |
#A) p-values filter | |
if(f==TRUE){ | |
stats.sig <- stats %>% filter(adjPvalues <= p | fisherAdjP <= p) | |
}else if(f==FALSE){ | |
stats.sig <- stats %>% filter(adjPvalues <= p ) | |
} | |
#B) FC filter | |
stats.sig <- stats.sig %>% filter(abs(coeff) >= FC) | |
if(nrow(stats.sig)==0){ | |
stop("no significant results at the specified filtering criteria") | |
} | |
#print results summary to screen: | |
if(f==FALSE){ | |
cat(paste("There were ",dim(stats.sig)[1],"OTUs/ASVs significantly different by",factor, | |
"that met",'\n',"threshold criteria of p",p,"absolute FC",FC)) | |
} | |
#GROUP SUMMARIES FOR CATEGORICAL VARIABLES | |
if(f==TRUE){ | |
#only keep results where at least one of the 2 groups has a threshold proportion of + samples: | |
N1 <- length(which(mod[,2]==1))# | |
N2 <- length(which(mod[,2]==0))# | |
N1+N2# | |
g1 <- grep("group 1",colnames(stats.sig)[1:2])# | |
g0 <- grep("group 0",colnames(stats.sig)[1:2])# | |
#FILTER BY SPECIFIED % PRESENCE IN AT LEAST ONE GROUP (DEFAULT=30%) | |
stats.sig<- stats.sig %>% filter(stats.sig[,g1] >= perc*N1 | fisherAdjP <= p | stats.sig[,g0] >= perc*N2) | |
if(nrow(stats.sig)==0){ | |
stop("no significant results at the specified filtering criteria") | |
} | |
#CHANGE "+samples in group1/0" columns to instead reflect mean count calculated across positive samples. | |
g1.1 <- grep("group 1",colnames(stats.sig)[3:4]) | |
g1.1 <- g1.1+2 | |
g0.1 <- grep("group 0",colnames(stats.sig)[3:4]) | |
g0.1 <- g0.1+2 | |
for(i in 1:dim(stats.sig)[1]){ | |
stats.sig [i,g0.1] <- round(stats.sig[i,g0.1]/stats.sig[i,g0],0) | |
} | |
colnames(stats.sig)[g0.1] <- "mean_positive_group0" | |
for(i in 1:dim(stats.sig)[1]){ | |
stats.sig [i,g1.1] <- round(stats.sig[i,g1.1]/stats.sig[i,g1],0) | |
} | |
colnames(stats.sig)[g1.1] <- "mean_positive_group1" | |
stats.sig$percent_positive_group1 <- round(stats.sig[,g1]/N1*100,1) | |
stats.sig$percent_positive_group0 <- round(stats.sig[,g0]/N2*100,1) | |
#shift these columns to begining of dataframe | |
stats.sig <- stats.sig[,c(ncol(stats.sig),(ncol(stats.sig)-1),3:ncol(stats.sig)-2)] | |
cat(paste("There were ",dim(stats.sig)[1],"OTUs/ASVs significantly different between",levels[[1]],"vs.",levels[[2]], | |
"that met",'\n',"threshold criteria of p",p,"absolute FC",FC,"and percentage presence in at least one group of",100*perc,"% \n")) | |
} | |
#------------------- | |
#write to file | |
#SELECT FILENAME | |
#--------------------------------------------------------- | |
#WRITE RESULTS TO FILE | |
file = paste0(outDir,"/",FileName,".csv") | |
#change 'NA' column to coeff. | |
print("writing results and model to file") | |
write.table(stats.sig,file, sep =",", col.names = NA) | |
save(stats.sig, file=paste0(outDir,"/",FileName,".RData")) | |
#append model | |
finalMod=fit@fit$design | |
suppressWarnings(write.table(finalMod,file,append=TRUE , sep =",", col.names = NA))#append model to file | |
#--------------------------------- | |
#GENERATE HEATMAPS | |
#--------------------------------- | |
#check if any otus to plot | |
if(dim(stats.sig)[1]<=1){ | |
stop("At least two signficant results are required for heatmap plotting") | |
} | |
#parameter specifications | |
file = paste0(outDir,"/",FileName,"_",heatmap.descriptor,".pdf") | |
print(file) | |
#annotation columns to plot on top of heatmap | |
cols = c(factor,extra.cols) | |
#first standardize physeq for use in heatmap | |
total = median(sample_sums(physeq)) | |
standf = function(x, t=total) round(t * (x / sum(x))) | |
p.std = transform_sample_counts(physeq, standf) | |
#manually sort columns by group? | |
if(ordered==TRUE){ | |
heatmap.k(physeq= p.std, data.subset = stats.sig, subset.samples = rownames(mod), #manual sort columns | |
annot.cols = cols,main = main, rows.sortby = rows.sortby, subt=subt,filename = file, order.by = factor, | |
Colv = FALSE, | |
colours=colours, labrow = TRUE, cexCol=cexCol, cexRow=cexRow, merged = merged) | |
}else{heatmap.k(physeq= p.std, data.subset = stats.sig, subset.samples = rownames(mod), #cluster columns | |
annot.cols = cols,main = main, rows.sortby = rows.sortby,subt=subt,filename = file, | |
colours=colours, labrow = TRUE,cexCol=cexCol, cexRow=cexRow, merged = merged)} | |
print("making heatmap of results") | |
suppressWarnings(warning("super.fitZig.kv")) | |
return(stats.sig) | |
} | |
#********************************** | |
#corr.k(): FUNCTION TO PERFORM PAIRWISE CORRELATIONS, TEST FOR SIGNIFICANCE AND PLOT CORRELLOGRAM | |
#********************************** | |
#ARGUMENTS: | |
#mat1: matrix 1 (columns of mat1 to be correlated with columns of mat2) | |
#mat2: matrix 2 | |
#p: adjusted p-value to sue as cutoff | |
#use: pairwise correlations? If yes: "pairwise" if not | |
#pdf.height: specify height of correlogram .pdf in inches, default=7 | |
#pdf.width: specify width of correlogram .pdf in inches, default=7 | |
#filename for correlogram | |
#outDir: desired output directory, default = working directory | |
corr.k <- function(mat1, mat2, p=0.05, use = "pairwise", adjust = "BH", r = 0.3, filename, rectangles = NULL, pdf.height=NULL, pdf.width=NULL){ | |
cor.mat = cor(mat1, mat2) | |
cor.p <- corr.test(mat1,mat2,use = use,adjust=adjust)#MTC adjustment with corr.test from package 'psych' | |
#NOW REPORT SIGNIFICANT ENTRIES | |
#If mat1=mat2, remove diagonals before counting | |
if(identical(mat1,mat2)==TRUE){ | |
diag(cor.p$p) <- 1 | |
diag(cor.mat) <-0 | |
} | |
sig_p <- length(which(cor.p$p <= p)) | |
print(paste("There were",sig_p,"significant correlations after MTC")) | |
#also filter on the magnitude of correlation (? 0.3) | |
sig_p_2 <- length(which(cor.p$p <= p & cor.mat >= r)) | |
print(paste("There were",sig_p_2,"significant correlations after MTC with R >= ",r)) | |
#------------------------------ | |
#GET CONFIDENCE INTERVALS | |
CIs = na.omit(cor.p$ci[abs(cor.p$ci$r) >= r & cor.p$ci$p <=p, ]) | |
#write CIs to file | |
write.csv(CIs,file = paste0(outDir,"/",filename,"CIs.csv")) | |
#------------------------------ | |
test.mat <- cor.mat | |
test.mat[which(cor.p$p > p | cor.mat < r)] <- NaN | |
sig.p <- cor.p$p | |
#exclude columns and rows with only NAs | |
na.cols <- apply(test.mat,2,function(x) all(is.na(x))) | |
na.rows <- apply(test.mat,1,function(x) all(is.na(x))) | |
test.mat <- test.mat[,!na.cols] | |
test.mat <- test.mat[!na.rows,] | |
sig.mat <- cor.mat | |
sig.mat <- sig.mat[rownames(test.mat),colnames(test.mat)] | |
sig.p <- sig.p[rownames(test.mat),colnames(test.mat)] | |
if(dim(sig.mat)[2]<=1){ | |
return(sig.mat) | |
} | |
# | |
upper = cor.p$ci$upper | |
upper.mat = matrix(upper, dim(cor.p$p)[1],dim(cor.p$p)[2])# | |
dim(upper.mat) | |
rownames(upper.mat) <- rownames(cor.p$p) | |
colnames(upper.mat) <- colnames(cor.p$p) | |
#subset to match sig.mat | |
dim(upper.mat) | |
upper.mat <- upper.mat[rownames(sig.mat),colnames(sig.mat)] | |
dim(upper.mat) | |
lower = cor.p$ci$lower | |
lower.mat = matrix(lower, dim(cor.p$p)[1],dim(cor.p$p)[2])# | |
rownames(lower.mat) <- rownames(cor.p$p) | |
colnames(lower.mat) <- colnames(cor.p$p) | |
#subset to match sig.mat | |
lower.mat <- lower.mat[rownames(sig.mat),colnames(sig.mat)] | |
#------------------------------ | |
#now sort by direction of correlation for ease of interpretation | |
if(identical(mat1,mat2)==FALSE){ | |
o.c <- colnames(sig.mat)[order(apply(sig.mat,2,mean), decreasing=FALSE)] | |
o.r <- rownames(sig.mat)[order(apply(sig.mat,1,function(x) sum(abs(x))), decreasing=FALSE)] | |
sig.mat <- sig.mat[o.r,o.c] | |
sig.p <- sig.p[o.r,o.c] | |
dim(sig.p) | |
#PLOT RESULTS AS CORRELOGRAM | |
pdf(paste0(outDir,"/",filename,".pdf"),width=pdf.width, height=pdf.height)# | |
corrplot(sig.mat, method='circle',p.mat = sig.p,sig.level=p, | |
insig = "blank",tl.cex=0.7,tl.col = "black", cl.cex = 0.7) | |
dev.off() | |
#REORDER CI MATRICES | |
lower.mat <- lower.mat[rownames(sig.mat),colnames(sig.mat)] | |
upper.mat <- upper.mat[rownames(sig.mat),colnames(sig.mat)] | |
#now plot with MTC and CIs: | |
pdf(paste0(outDir,"/",filename,"p.adj_",p,"_CIs.pdf"),width = pdf.width, height = pdf.height)# | |
corrplot(sig.mat, method='circle',p.mat = sig.p,sig.level=p,low = lower.mat, upp = upper.mat, | |
rect.col = "navy",plotC="rect",cl.pos ="n", | |
insig = "blank",tl.cex=0.7,tl.col = "black", cl.cex = 0.7) | |
dev.off() | |
}else{#PLOT RESULTS AS SQUARE CORRELOGRAM WITH HIERARCHICAL CLUSTERING | |
pdf(paste0(outDir,"/",filename,".pdf"), width=pdf.width, height=pdf.height)# | |
corrplot(sig.mat, method='circle',p.mat = sig.p,sig.level=p, | |
insig = "blank",order = "hclust",addrect = rectangles,tl.cex=0.7,tl.col = "black", cl.cex = 0.7) | |
dev.off() | |
#with CIs | |
#now plot with MTC and CIs: | |
pdf(paste0(outDir,"/",filename,"p.adj_",p,"_CIs.pdf"), width = pdf.width, height = pdf.height)# | |
corrplot(sig.mat, method='circle',p.mat = sig.p,sig.level=p,low = lower.mat, upp = upper.mat, | |
rect.col = "navy",plotC="rect",cl.pos ="n", | |
insig = "blank",order = "hclust",addrect = rectangles,tl.cex=0.7,tl.col = "black", cl.cex = 0.7) | |
dev.off() | |
} | |
} | |
#********************************** | |
#MC.summary(): take the output from PICRUSt's metagenome_contributions.py, together with taxonomic annotation for the OTUs included in | |
#this table and provides a summary of the contribution of each Family/Genus.. etc to ONE SPECIFIC KEGG gene e.g. K02030 | |
#********************************** | |
#ARGUMENTS: | |
#metagenome.contributions = output from PICRUSt's metagenome_contributions.py (imported as a data frame) | |
#KEGG.gene: KEGG gene of interest e.g. "K02030" | |
#samples: list of sample names to be included | |
#level: desired tax level for summary e.g. "Family", "Genus" depending on the column names of your tax table | |
#outDir: output directory | |
#file.ext: descriptive file extension such as "Family_K02030" | |
#tax.table: taxonomic table for OTUs of interest | |
MC.summary = function(metagenome.contributions, KEGG.gene, tax.table,samples, level,outDir, file.ext){ | |
out = c() | |
for (i in 1:length(samples)) {#for each sample, summarise each family's contribution to the KEGG gene of interest | |
Kc = metagenome.contributions[metagenome.contributions$Gene == KEGG.gene,]#subset input by KEGG gene of interest | |
this.data=Kc[Kc$Sample==samples[i],]#get data for that specific sample | |
otus=this.data[,3]#get all otus for this sample and this gene | |
this.tax.table=tax.table[as.character(unique(otus)),]#get corresponding taxonomy for these OTUs. | |
tot.taxa=sum(this.data[,7] )#get the percentage contributed by all OTUs in sample i for this specific gene, which should = 1 | |
taxa=unique(this.tax.table[,level] )#get list of unique taxa at required tax level | |
taxa[taxa==" "] = "other"#for taxa with no family level annotation - these will all be summarised together as 'other' | |
for (j in 1:length(taxa) ) { | |
map2fam = rownames(this.tax.table)[this.tax.table[,level] == taxa[j]]#get OTU IDs for a particular family | |
taxa.tot=sum(this.data[this.data$OTU %in% map2fam,7] )/tot.taxa#get the total percent contribution for each family to this specific gene for sample i | |
line = t(c(as.character(samples[i]), taxa[j], taxa.tot )) | |
out = rbind(line,out) | |
} | |
print(i) | |
} | |
#write results to file | |
colnames(out) = c("Sample","Taxa","Contribution") | |
out = data.frame(out) | |
out$Contribution = as.numeric(as.character(out$Contribution)) | |
write.table(out,file=paste0(outDir,file.ext,'.txt'), quote=FALSE,sep = "\t", col.names=TRUE, row.names=FALSE) | |
return(out) | |
} | |
#********************************** | |
#RF.k(): performs random forests analysis on the otu table of a supplied phyloseq object. | |
#NB: this function is only setup for two-class categorical response variables NOT regression on continuous response variables | |
#The data is randomly divided into a training (two thirds of the data) and test set (remaining one third of the data not used for training) | |
#results printed to screen include most important taxa, AUC, PPV, NPV | |
#Option to specify the top 'x' taxa to see how they perform | |
#********************************** | |
#ARGUMENTS: | |
#data = phyloseq object of interest | |
#var = variable of interest sample_data(data) column name | |
#outDir = output directory for results | |
#train.control: TRUE or FALSE, should RF algorithm parameters be tuned by means of cross-validation (THIS INCREASE RUN TIME, DEFAULT=FALSE) | |
#cv.fold = cross-validation fold for tuning the RF model (default=10) | |
#cv.repeat = For repeated k-fold cross-validation only: the number of complete sets of folds to compute | |
#Nfeatures.validation: 'x' number of top taxa to test (e.g. how good are the top 3 most important taxa at classifying) | |
#outDir = output directory for results | |
#testAndtrain: should the dataset be divided into training (randomly select two thirds of the data) and test sets (the remaining one third of the data) | |
#testAndtrain valid options: TRUE | FALSE | |
#descriptor: any additional description that needs to go in the file name (e.g. 'merged_otus') | |
#np: number of top features to display in table and variable importance plot | |
#positive.class: the positive class needs to be specified - this should match one of the levels() under 'var' of interest e.g: | |
#for the variable "TB status" the positive class might be "TBpos" - this is used to calculate PPV, NPV | |
#train.fraction: what fraction of samples should be used for training if testAndtrain=TRUE (the remainder will be used for testing) | |
RF.k <- function(data, seed = 2,var, cv.fold=10, train.control=FALSE,cv.repeat=3,outDir,Nfeatures.validation=NULL, testAndtrain=FALSE,np=50, | |
positive.class,descriptor = c(),train.fraction=2/3,...){#function default values supplied, change as required | |
summary_file <- paste0(outDir,"/RF",var,"_results_",cv.fold,"_", Nfeatures.validation,"_",seed,descriptor,".txt") | |
write(print(paste(length(which(is.na(sample_data(data)[,var]))),"samples did not have response variable data, removing these...")), | |
file = summary_file,sep = "\t") | |
sub.index <- which(!is.na(sample_data(data)[,var]))# | |
data <- prune_samples(sample_names(data)[sub.index],data) | |
if(testAndtrain==TRUE){ | |
table = otu_table(data) | |
train.n = round(train.fraction*nsamples(data)) | |
#generate random integers between 1 and nsamples(phy.temp) | |
set.seed(seed) | |
train.rows=createDataPartition(y=unlist(sample_data(data)[,var]), p=train.fraction, list=FALSE)#randomly select desired fraction of samples for training and testing while balancing class distributions within splits | |
train = table[,train.rows] | |
test = table[,-train.rows] | |
tbl <- table(sample_data(data)[colnames(train),var]) | |
write(print(paste("Training set size: ",dim(train)[2], "samples","with",tbl[1],"and",tbl[2], "samples per class")), | |
file = summary_file,sep = "\t", append=T) | |
tbl <- table(sample_data(data)[colnames(test),var]) | |
write(print(paste("Test set size: ",dim(test)[2], "samples","with",tbl[1],"and",tbl[2], "samples per class")), | |
file = summary_file,sep = "\t", append=T) | |
#TRAINING - RANDOMLY SELECT 'N=training.fraction' OF DATASET | |
# Make a dataframe of training data with OTUs as column and samples as rows | |
predictors <- t(train) | |
predictors = data.frame(predictors) | |
#------------------ | |
# Make one column for our outcome/response variable | |
response <- as.factor(unlist(sample_data(data)[train.rows,var])) | |
names(response) <- sample_names(data)[train.rows] | |
} | |
else if(testAndtrain==FALSE){ | |
table = otu_table(data) | |
tbl <- table(sample_data(data)[,var]) | |
write(print(paste("Data set size: ",tbl[1]+tbl[2], "samples","with",tbl[1],"and",tbl[2], "samples per class")), | |
file = summary_file,sep = "\t", append=T) | |
# Make a dataframe of training data with OTUs as column and samples as rows | |
predictors <- t(table) | |
predictors = data.frame(predictors) | |
#------------------ | |
# Make one column for our outcome/response variable | |
response <- as.factor(unlist(sample_data(data)[,var])) | |
names(response) <- sample_names(data) | |
} | |
#------------------------------------ | |
#BUILD RF MODEL | |
# Combine them into 1 data frame | |
rf.data <- data.frame(response, predictors) | |
head(rf.data) | |
set.seed(seed) | |
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
write(print("******************************************************************************"),file = summary_file,sep = "\t", append=T) | |
write(print("Building RF model on training set....."),file = summary_file,sep = "\t", append=T) | |
if(train.control==TRUE){ | |
write(print(paste("Performing model tuning:",cv.repeat,"round(s) of",cv.fold,"fold cross-validation")),file = summary_file,sep = "\t", append=T) | |
write(print("******************************************************************************"),file = summary_file,sep = "\t", append=T) | |
classify<-train(response~.,data=rf.data,method="rf", | |
trControl=trainControl(method="repeatedcv",repeats=cv.repeat,number=cv.fold), metric="Accuracy", tuneLength=10) | |
print(classify) | |
} | |
else if(train.control==FALSE){ | |
write(print("Performing model tuning.."),file = summary_file,sep = "\t", append=T) | |
write(print("******************************************************************************"),file = summary_file,sep = "\t", append=T) | |
classify<-train(response~.,data=rf.data,method="rf", metric="Accuracy") | |
print(classify) | |
} | |
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
Cmat = confusionMatrix(data = classify$finalModel$predicted, reference = rf.data$response, positive=positive.class) | |
print(Cmat) | |
sink(summary_file, append=T) | |
print(Cmat) | |
sink() | |
#------------------------------------------ | |
print("******************************************************************************") | |
write(print("******************************************************************************"),file = summary_file,sep = "\t", append=T) | |
#------------------------------------------ | |
#RF CV for feature selection: | |
rf.cv = rfcv(rf.data[,2:dim(rf.data)[2]], rf.data[,1], cv.fold=cv.fold) | |
write(print("Cross-validated error rates associated with stepwise reduction of features:"),file = summary_file,sep = "\t", append=T) | |
print(rf.cv$error.cv) | |
sink(summary_file, append=T) | |
print(rf.cv$error.cv) | |
sink() | |
pdf(paste0(outDir,"/RF_elbow_plot_",var,"_",cv.fold,"_",Nfeatures.validation,"_",seed,descriptor,".pdf")) | |
with(rf.cv, plot(n.var, error.cv, log="x", type="o", lwd=2)) | |
dev.off() | |
#---------------------------------- | |
# Make a data frame with predictor names and their importance | |
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
imp=importance(classify$finalModel) #When we use metric="Accuracy" without importance=TRUE in the model build train() we get the same results we would've when using the randomForest package (meanDecreaseGini) | |
imp <- data.frame(predictors = rownames(imp), imp) | |
# Order the predictor levels by importance | |
imp.sort <- arrange(imp, desc(MeanDecreaseGini)) | |
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors) | |
# Select the top n predictors | |
imp.n <- imp.sort[1:np, ] | |
#add taxonomy info | |
rownames(imp.n) = imp.n[,1] | |
labs = tax.lab(imp.n, data, labrow=TRUE,merged=FALSE) | |
imp.n$tax = labs | |
imp.n$tax = factor(imp.n$tax, levels = imp.n$tax)#avoid ggplot sorting alphabetically\ | |
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
write(print("******************************************************************************"),file = summary_file,sep = "\t", append=T) | |
write(print(paste("THE TOP",np," MOST IMPORTANT FEATURES WERE:")),file = summary_file,sep = "\t", append=T) | |
print(imp.n) | |
sink(summary_file, append=T) | |
print(imp.n) | |
sink() | |
write(print("******************************************************************************"),file = summary_file,sep = "\t", append=T) | |
p <- ggplot(imp.n,aes(x = imp.n[,"tax"], y = imp.n[,"MeanDecreaseGini"]),environment = environment())+ | |
geom_bar(stat = "identity", fill = "grey") + | |
coord_flip() + | |
ggtitle(paste("Most important taxa for classifying samples by ",var))+ | |
theme_bw() + | |
theme(panel.background = element_blank(), | |
panel.grid.major = element_blank(), #remove major-grid labels | |
panel.grid.minor = element_blank(), #remove minor-grid labels | |
plot.background = element_blank()) | |
pdf(paste0(outDir, "/RFs",var,"variable_importance_plot_",cv.fold,"_",Nfeatures.validation,"_",seed,descriptor,".pdf"))#note: ggplot doesn't plot within a function - this is a workaround using print() | |
print(p) | |
dev.off() | |
#------------------------------------ | |
#SEE HOW THE TOP X NUMBER OF FEATURES DO - SPECIFY IN 'Nfeatures.validation' PARAMETER | |
if(length(Nfeatures.validation)!=0){ | |
#sort by mean decrease in Gini index and select top 'x' | |
goodPredictors = imp.n[1:Nfeatures.validation,1] | |
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
classify.x<-train(response~.,data=rf.data[,c(as.character(goodPredictors),"response")],method="rf",metric="Accuracy") | |
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
write(print("******************************************************************************"),file = summary_file,sep = "\t", append=T) | |
write(print(paste("TRAINING SET classification summary if using the TOP",Nfeatures.validation, "features only")),summary_file,sep = "\t", append=T) | |
write(print("******************************************************************************"),file = summary_file,sep = "\t", append=T) | |
write(print(paste("Feature(s) selected:",goodPredictors)),file = summary_file,sep = "\t", append=T) | |
print(classify.x$finalModel) | |
#confusion matrix: | |
Cmat.x = confusionMatrix(data = classify.x$finalModel$predicted, reference = rf.data$response, positive=positive.class) | |
print(Cmat.x) | |
sink(summary_file, append=T) | |
print(Cmat.x) | |
sink() | |
} | |
if(testAndtrain==TRUE){ | |
#--------------------------------------- | |
#NOW TEST CLASSIFIER IN TEST COHORT (I.E. REMAINING 1/3 OF DATASET) USING THE TOP X FEATURES | |
#--------------------------------------- | |
test <- t(test) | |
test = data.frame(test) | |
if(length(Nfeatures.validation)!=0){ | |
rf.test = classify.x | |
} | |
else{rf.test = classify} | |
#Predict classes based on model built on training data (reduced features set) | |
predict.test = predict(rf.test, test) | |
#Get actual classes for response variable, test samples | |
test.response <- as.factor(unlist(sample_data(data)[-train.rows,var])) | |
names(test.response) <- sample_names(data)[-train.rows] | |
#identical(names(test.response),rownames(test)) | |
Cmat.test=confusionMatrix(data=predict.test, reference=test.response, positive=positive.class) | |
write(print("******************************************************************************"),file = summary_file,sep = "\t", append=T) | |
write(print(paste("TEST SET model testing (",round((1-train.fraction)*100) ,"% of the data )")),file = summary_file,sep = "\t", append=T) | |
if(length(Nfeatures.validation)!=0){ | |
write(print(paste("using the top",Nfeatures.validation,"features")),file = summary_file,sep = "\t", append=T) | |
} | |
else{write(print("using all features"),file = summary_file,sep = "\t", append=T) | |
} | |
write(print("******************************************************************************"),file = summary_file,sep = "\t", append=T) | |
print(Cmat.test) | |
sink(summary_file, append=T) | |
print(Cmat.test) | |
sink() | |
#--------------------------------- | |
} | |
} | |
Hi Katie
The heat map looks great after removing the the PatientIDs. Thank you for your prompt response and help
Best
Jai
Hi Katie
I am having trouble with-
- superfitZig.kv function returns an error- [1] "0 of 96 samples were removed due to missing data"
Error in cumNormStatFast(obj) : Warning sample with one or zero features
Called from: cumNormStatFast(obj)
I am confused as to why the function is aborted. I filtered the data -
#Remove ASVs that are only present in Group"None" i.e. subset of samples have zero count
length(which(rowSums(otu_table(phy.temp))==0)) #0
keep <- which(rowSums(otu_table(phy.temp)) !=0)
#Remove empty rows
phy.temp <- prune_taxa(names(keep), phy.temp)
ntaxa(phy.temp) #75
I have looked at the data and cannot find any zero counts. Any advice on how to troubleshoot and fix this?
Appreciate your help
Best
Jai
Hi Katie
I am currently using the custom_microbiome tutorial you develop.
I am having trouble with the custom function.
M.phy <- tax_glom.kv(M.f)
[1] "Removing phylogenetic tree"
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'x' in selecting a method for function 'print': error in evaluating the argument 'physeq' in selecting a method for function 'ntaxa': object 'phy.new' not found
In addition: Warning message:
In [<-
(*tmp*
, i, value = phy.k.merged) :
implicit list embedding of S4 objects is deprecated
Called from: h(simpleError(msg, call))
Browse[1]>
Any lead on how to solve this ?
Hi @Hermandez
It's hard to say, you can send me your phyloseq object if you'd like and I can have a look at what's going wrong.
Regards,
Katie.
Hi Katie.
Let me do so via email.
Thank you.
Hello Katie,
Happy new year.
I have the following errors while using the microbiome_custom_functions
1)
hm = heatmap.k(physeq= M.f, annot.cols = c(7,10), main = main,filename = f,colours=colours,Colv = sample.order,labrow = TRUE,scale = 1)
[1] "including all samples"
[1] "including all otus"
Error during wrapup: error in evaluating the argument 'object' in selecting a method for function 'sample_data': undefined columns selected
Error: no more error handlers available (recursive errors?); invoking 'abort' restart
barplot = bar.plots(physeq = M.std,cat = "AMF",level = level, count = count, perc = perc, filen = 'Barplots_by_AMF')
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'object' in selecting a method for function 'sample_data': undefined columns selected
Called from: h(simpleError(msg, call))
Hi Katie,
I am having trouble when trying to cluster/merge features.
M.phy <- tax_glom.kv(M.std)
And I am not able to understand the error displayed.
[1] "Removing phylogenetic tree"
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'x' in selecting a method for function 'print': error in evaluating the argument 'physeq' in selecting a method for function 'ntaxa': object 'phy.new' not found
In addition: Warning message:
In [<-
(*tmp*
, i, value = phy.k.merged) :
implicit list embedding of S4 objects is deprecated
Called from: h(simpleError(msg, call))
Browse[1]>
Appreciate your help
Best
Hermandez
Hi Katie,
A kind reminder of my issue
Hi @Hermandez I don't get this error for the tax_glom.kv() function. You will need to email your data objects if I am to figure this out
Hi @kviljoen, I have not heard from you. I hope you received the data as shared.
HI @Hermandez apologies for the delayed response I'm not finding the email you sent, please can you resend the data?
Regards,
Katie.
Hi Katie, I just resent the data. Kindly confirm.
@Hermandez received, thanks.
Hi Katie,
I want to analyse the reads separately, Forward (F) and Reverse (R).
Allow me to send you a phyloseq object file for each.
Yves based on the data that you sent it is unclear what you are using as input for the tax_glom.kv function which accepts a phyloseq object. Also not sure why you have F and R otu and taxonomy tables, you should only have one each after merging reads during preprocessing?
…
On Wed, Nov 9, 2022 at 8:27 AM Hermandez @.> wrote: @.* commented on this gist. ------------------------------ Hi Katie, I just resent the data. Kindly confirm. — Reply to this email directly, view it on GitHub https://gist.github.com/97d36c689c5c9b9c39939c7a100720b9#gistcomment-4363184 or unsubscribe https://github.com/notifications/unsubscribe-auth/ACGSZMCHSTXSYSFVWAW3M53WHM76VBFKMF2HI4TJMJ2XIZLTSKBKK5TBNR2WLJDHNFZXJJDOMFWWLK3UNBZGKYLEL52HS4DFQKSXMYLMOVS2I5DSOVS2I3TBNVS3W5DIOJSWCZC7OBQXE5DJMNUXAYLOORPWCY3UNF3GS5DZVRZXKYTKMVRXIX3UPFYGLK2HNFZXIQ3PNVWWK3TUUZ2G64DJMNZZDAVEOR4XAZNEM5UXG5FFOZQWY5LFVA4TANBSGYYDKNNHORZGSZ3HMVZKMY3SMVQXIZI . You are receiving this email because you authored a thread. Triage notifications on the go with GitHub Mobile for iOS https://apps.apple.com/app/apple-store/id1477376905?ct=notification-email&mt=8&pt=524675 or Android https://play.google.com/store/apps/details?id=com.github.android&referrer=utm_campaign%3Dnotification-email%26utm_medium%3Demail%26utm_source%3Dgithub .
-- Katie Bioinformatician Division of Computational Biology Institute of Infectious Diseases & Molecular Medicine University of Cape Town South Africa +27 21 406 6176 @.***>
I just resent the data as used to create the phyloseq object and the R Studio script
Thank You Katie. It works this time.
I am having trouble figuring out this error.
diss <- distance(merged.stdF,method = "bray", type = "samples")
#Error
Error in distance(merged.stdF, method = "bray", type = "samples") :
unused arguments (method = "bray", type = "samples")
Hi there,
You're welcome. Is this the figure that appeared in your RStudio window or the saved .pdf? It looks like you have quite a lot of patient IDs. Try excluding those from the figure legend (don't select as part of annot.cols) and see if that works? Otherwise the legend crowds the figure if it has too many entries..
Katie.