diff --git a/V1.7.1/ODA_1.7.1.R b/V1.7.1/ODA_1.7.1.R new file mode 100755 index 0000000000000000000000000000000000000000..60955fe7008eeca4a620ddd6d3cea3e01e9fe52d --- /dev/null +++ b/V1.7.1/ODA_1.7.1.R @@ -0,0 +1,1075 @@ +verNum='V1.7.1' +#This script was developed by zhiyu.zhao@utsouthwestern.edu . +#v1.7.1, 12/04/2019: Figures were saved as both .ps and .png files. Euler plot was disabled for >6 samples. Samples / features can be clustered or not on the heatmaps. +#v1.7.0, 10/22/2019: Figures were saved as .ps files in a Figure folder. Correlation plot was generated with the old method for better speed. Euler plot was disabled for >10 samples. Y axis of volcano plot could be raw or adjusted p-values. DESeq was renamed RLE as I would implement the DESeq differential expression analysis in the future. RLE is the same method used by DESeq but I slightly modified it to allow for zeros, missing values and / or values below the value filter. Due to this reason I couldn't use the DESeq normalization directly. +#v1.6.9, 10/4/2019: A bug regarding cpglm was fixed. The bug caused an error in cpglm fitting but the class(fit) was still 'cpglm' and fit$converged was still TRUE. Visualization option was added. Version number checkup was added for the data template. Multiple other minor improvements. +#v1.6.8, 09/19/2019: A bug regarding featureDescription was fixed. The bug was present only when data format was "table with feature description" and samples were excluded which happened to cause some features to be all-empty so that those features needed to be excluded as well. +#v1.6.7, 08/29/2019: Correlation plot was enchanced. Euler diagram was added. Software version number was reported in the result file. +#v1.6.6, 08/28/2019: Data table with feature description column was enabled, minor bugs fixed, and volcano plot colors customized. +#v1.6.5, 08/27/2019: A bug regarding number of features per sample when data imputation = "NA" was fixed. 
+#v1.6.4, 08/08/2019: A bug regarding groupMeanLowerBound was fixed. Definition and default value of groupCountFilter were modified. Data imputation was implemented. Missing values were allowed. +#v1.6.3, 08/05/2019: A bug regarding nSamples and formula string generation (only when technical replicates are present) was fixed. +#v1.6.2, 07/24/2019: A bug regarding normalization size factors was fixed. The bug caused size factors to be applied to incorrect samples. If you have used versions before this fix, please rerun your analysis using this fixed version. +#To run this program, first install R (version 3.5.1 was tested by the author), then run the following from the command line of your operating system. + +#Rscript path1/this_program.R path2/your_input_file.xlsm path3/your_output_file.xlsx #Depending on your operating system, / can be \. + +#Use the data_template_1.6.xlsm file accompanying this program to generate your_input_file.xlsm. All the parameters can be changed inside that file. Please read the instructions there. Excel macros need to be allowed. + +#############################Begin Reading Arguments############################# +if (1) { #Set to 1 for actual files or 0 for debugging a test.xlsm in the R console. 
+ args=commandArgs(trailingOnly=T) + if (length(args)!=2) + stop('Please specify input and output file names.') + inputFile=args[1] + outputFile=args[2] +} else { + inputFile='test.xlsm' + outputFile='Result_of_test.xlsx' +} +if (!file.exists(inputFile)) + stop('Input file is not found.') +if (file.exists(outputFile)) + stop('Output file exists.') +figuresPath=paste0(tools::file_path_sans_ext(outputFile),'_Figures') +if (file.exists(figuresPath)&&!rlang::is_empty(dir(figuresPath))) { + stop('Figures folder is not empty.') +} else { + dir.create(figuresPath) +} +#############################End Reading Arguments############################# + +#############################Begin Installing/Loading Libraries############################# +myLibPath='~/R/3.5.1-gccmkl' #Specify your own library path here. +dir.create(myLibPath,recursive=T,showWarnings=F) +.libPaths(c(myLibPath,.libPaths())) +#This program was tested under R version 3.5.1 with the following required libraries. +if (!require(openxlsx)) { + install.packages('openxlsx',lib=myLibPath,repos='https://cran.r-project.org/') + library(openxlsx) +} +if (!require(data.table)) { + install.packages('data.table',lib=myLibPath,repos='https://cran.r-project.org/') + library(data.table) +} +if (!require(gtools)) { + install.packages('gtools',lib=myLibPath,repos='https://cran.r-project.org/') + library(gtools) +} +if (!require(matrixStats)) { + install.packages('matrixStats',lib=myLibPath,repos='https://cran.r-project.org/') + library(matrixStats) +} +if (!require(cplm)) { + install.packages('cplm',lib=myLibPath,repos='https://cran.r-project.org/') + library(cplm) +} +if (!require(ggplot2)) { + install.packages('ggplot2',lib=myLibPath,repos='https://cran.r-project.org/') + library(ggplot2) +} +if (!require(cowplot)) { + install.packages('cowplot',lib=myLibPath,repos='https://cran.r-project.org/') + library(cowplot) +} +if (!require(pheatmap)) { + 
install.packages('pheatmap',lib=myLibPath,repos='https://cran.r-project.org/') + library(pheatmap) +} +if (!require(ggcorrplot)) { + install.packages('ggcorrplot',lib=myLibPath,repos='https://cran.r-project.org/') + library(ggcorrplot) +} +if (!require(eulerr)) { + install.packages('eulerr',lib=myLibPath,repos='https://cran.r-project.org/') + library(eulerr) +} +if (!require(GGally)) { + install.packages('GGally',lib=myLibPath,repos='https://cran.r-project.org/') + library(GGally) +} +#############################End Installing/Loading Libraries############################# + +#############################Begin Main############################# + +#############################Begin Default Settings############################# +#Below are default settings. DO NOT change them here. +#If you use the data_template.xlsm file accompanying this program to generate your input, all the parameters can be changed inside your_input_file.xlsm . Please read the instructions there. +dataFormat='list' #Available options: list; table. +valueFilter=0 #A cutoff-value used to count the number of features in a sample. If feature value <= valueFilter, the feature is taken as absent from the sample. +groupCountFilter=3 #A cutoff-value used to count the number of features in a group. If the number of samples < groupCountFilter for a feature, the feature is taken as absent from the group. +groupMeanLowerBound=0 #A cutoff-value used as group mean when it's lower than the cutoff. This is for the adjustment of fold change when a group has a very low mean value. +pCutoffs=0.05 #P-value cutoff for the differential expression (DE) analysis. Multiple values are allowed. +fcCutoffs=2 #Fold change cutoff for the DE analysis. Multiple values are allowed. +batchEffectReduction=F #Whether to model batch effect. Only available for linear models such as cpglm and glm. +averagingTechReps=F #Whether to average technical replicates before any analysis steps. 
+normalization='RLE' #Available options: No; RLE; BySize; ByMedian; BySum; ByUpperQuantile. +uqProb=0.75 #If normalization=ByUpperQuantile, this value is used as the quantile cutoff. +logTransform=T #Whether to log2-transform the data for statistical tests. +logOffset=1 #An offset-value added to the data before the log-transformation. +statTest='glm' #Statistical test method for DE. Available options: t-test; cpglm-glm; glm. +controlGroups=NULL #Control group name(s) for DE. A control group will be tested against every other group. +dePairs=NULL #Group name pair(s) for DE. Each pair of group names is a DE comparison. +interestedFeatures=NULL #Interested features to be visualized separately. +excludedFeatures=NULL #Features to be completed excluded from the analysis. +dataImputation='half minimum' #Method to impute missing values and those less than or equal to the value filter. It affects statistical tests but not visualizations. See data template for details. +visual=T #Whether to visualize data using graphs. +volcanoP='adjusted p-value' #Which p-value as the y-axis of volcano plots. +hclustSamples=T #Whether to cluster samples on the heatmap. +hclustFeatures=T #Whether to cluster features on the heatmap. 
+colors=c('Red','Yellow','Blue','Orange','Green','Purple','Magenta','Cyan','Gray','Black') +heatmapColors=colorRampPalette(c('green','black','red'))(256) +#############################End Default Settings############################# +message('###################################################################') +message('Analysis started ...') +#############################Begin Settings############################# +message('Reading input settings ...') +sheetNames=getSheetNames(inputFile) +if ('Samples' %in% sheetNames) { + sampleIDs=read.xlsx(inputFile,sheet='Samples',colNames=T,rowNames=T,skipEmptyRows=T,skipEmptyCols=T,cols=c(1:5)) + if (!is.null(sampleIDs)&&dim(sampleIDs)[1]>0) { + for (i in 1:dim(sampleIDs)[1]) + if (grepl('\\s',rownames(sampleIDs)[i])) + stop(paste0('Sample ',i,': ',rownames(sampleIDs)[i],' has white space(s). Please remove them.')) + sampleIDs=sampleIDs[!is.na(sampleIDs[,1]),1:4] #Excluding samples without group ID. + } + nSamples=dim(sampleIDs)[1] + if (nSamples<2) + stop('At least 2 samples should have group IDs.') + if (sum(!is.na(sampleIDs[,2]))==0) #No biological replicate IDs. Automatic IDs will be generated. + sampleIDs[,2]=c(1:nSamples) + else if (sum(!is.na(sampleIDs[,2]))!=nSamples) + stop('If you provide biological replicate IDs, they should be present for all samples.') + if (sum(!is.na(sampleIDs[,3]))==0) #No batch IDs. Automatic IDs will be generated. 
+ sampleIDs[,3]='Batch1' + else if (sum(!is.na(sampleIDs[,3]))!=nSamples) + stop('If you provide batch IDs, they should be present for all samples.') + sampleIDs[,1]=as.factor(sampleIDs[,1]) + sampleIDs[,2]=as.factor(sampleIDs[,2]) + sampleIDs[,3]=as.factor(sampleIDs[,3]) + sampleIDs[,4]=as.numeric(sampleIDs[,4]) +} else + stop('Sample sheet is missing.') +if ('Parameters' %in% sheetNames) { + if (read.xlsx(inputFile,sheet='Parameters',colNames=F,rows=1,cols=1)!=verNum) + stop(paste0('Please use data template ',verNum,' for this analysis.')) + parameters=read.xlsx(inputFile,sheet='Parameters',colNames=T,rowNames=T,na.strings=NULL) + if (!is.null(parameters)&&dim(parameters)[1]>0) { + if (!is.na(parameters[1,1])) + dataFormat=tolower(as.character(parameters[1,1])) + if (!is.na(parameters[2,1])) + valueFilter=as.numeric(parameters[2,1]) + if (!is.na(parameters[3,1])) + groupCountFilter=as.integer(parameters[3,1]) + if (!is.na(parameters[4,1])) + groupMeanLowerBound=as.numeric(parameters[4,1]) + if (sum(!is.na(parameters[5,]))) + pCutoffs=as.numeric(parameters[5,!is.na(parameters[5,])]) + if (sum(!is.na(parameters[6,]))) + fcCutoffs=as.numeric(parameters[6,!is.na(parameters[6,])]) + if (!is.na(parameters[7,1])) + normalization=parameters[7,1] + if (normalization=='BySize'&&sum(!is.na(sampleIDs[,4]))!=nSamples) + stop('If you provide size factors, they should be present for all samples.') + if (normalization=='ByUpperQuantile'&&!is.na(parameters[7,2])) + uqProb=as.numeric(parameters[7,2]) + if (!is.na(parameters[8,1])&&parameters[8,1]=='No') + logTransform=F + if (!is.na(parameters[9,1])&&as.numeric(parameters[9,1])>=0) + logOffset=as.numeric(parameters[9,1]) + if (!is.na(parameters[10,1])) + statTest=parameters[10,1] + if (!is.na(parameters[11,1])) + dataImputation=tolower(as.character(parameters[11,1])) + if (!is.na(parameters[12,1])&&parameters[12,1]=='No') + visual=F + if (!is.na(parameters[13,1])) + volcanoP=tolower(as.character(parameters[13,1])) + if 
(!is.na(parameters[14,1])&&parameters[14,1]=='No') + hclustSamples=F + if (!is.na(parameters[15,1])&&parameters[15,1]=='No') + hclustFeatures=F + } +} else + stop('Parameter sheet is missing.') +if ('Comparisons' %in% sheetNames) { + dePairs=read.xlsx(inputFile,sheet='Comparisons',colNames=T,rowNames=F,cols=c(1:2)) + tempStr=unlist(dePairs) + if (sum(!(tempStr[!is.na(tempStr)] %in% sampleIDs[,1]))>0) + stop('Unknown group ID(s) in the Comparisons sheet.') + if (!is.null(dePairs)&&dim(dePairs)[1]>0) { + controlGroups=dePairs[!is.na(dePairs[,1])&is.na(dePairs[,2]),1] + dePairs=dePairs[!is.na(dePairs[,1])&!is.na(dePairs[,2]),1:2] + } +} +if ('Features' %in% sheetNames) { + tempFeatures=read.xlsx(inputFile,sheet='Features',colNames=T,rowNames=F,cols=c(1:2)) + if (!is.null(tempFeatures)&&dim(tempFeatures)[1]>0) { + interestedFeatures=tempFeatures[!is.na(tempFeatures[,1]),1] + excludedFeatures=tempFeatures[!is.na(tempFeatures[,2]),2] + } +} +#############################End Settings############################# + +#############################Begin Reading Data############################# +#Please read instructions in the data_template.xlsm and use the RawData sheet to hold your data. +#If the data are generated by the metabolomics core facility at CRI, UTSW, usually they are in the list format. +#If the data are in the list format, in the excel sheet three columns will be read: feature ID in column 1, sample ID in column 2, and value in column 5. +#If the data are in the table format, in the excel sheet all columns will be read: feature ID in column 1 and sample values in the next columns. 
+message('Reading data ...') +if (dataFormat=='list') { + rawData=read.xlsx(inputFile,sheet='RawData',colNames=T)[,c(1,2,5)] +} else if (dataFormat=='table') { + data=read.xlsx(inputFile,sheet='RawData',colNames=T,rowNames=F) + featureDescription=NULL + rawData=melt(data,id.vars=1) + rawData[,2]=as.character(rawData[,2]) + rawData[,3]=as.numeric(rawData[,3]) +} else if (dataFormat=='table with feature description') { + data=read.xlsx(inputFile,sheet='RawData',colNames=T,rowNames=F) + featureDescription=data[,1:2] + data=data[,-2] + rawData=melt(data,id.vars=1) + rawData[,2]=as.character(rawData[,2]) + rawData[,3]=as.numeric(rawData[,3]) +} else { + stop('Only list and table are supported data formats. Check your Parameters sheet.') +} +colnames(rawData)=c('Feature','Sample','Value') +if (!is.null(excludedFeatures)) + rawData=rawData[!(rawData$Feature %in% excludedFeatures),] #If speficied, features are excluded. +rawData=rawData[!is.na(sampleIDs[rawData$Sample,1]),] #Samples with empty group IDs are excluded. +if (sum(is.na(rawData$Value))) + message('Missing values are detected.') +#rawData[is.na(rawData$Value),'Value']=0 +if(sum(rawData$Value<0,na.rm=T)&&(normalization!='No'||logTransform)) + stop('Negative value(s) are not allowed when you normalize or log-transform your data.') +#The following code is to generate automatic feature IDs in case they are not unique. +if (length(unique(rawData$Feature))*length(unique(rawData$Sample))!=dim(rawData)[1]) { + rowNames=rawData$Feature + ind=data.frame() + for (i in 1:dim(rawData)[1]) { + r=which(rownames(ind)==rawData$Feature[i]) + c=which(colnames(ind)==rawData$Sample[i]) + if (is.na(r==0||c==0)||is.na(ind[r,c])) { + + ind[rawData$Feature[i],rawData$Sample[i]]=1 + } else { + #Currently the auto-ID function is disabled so that problems caused by mistakes are not ignored. If preferred, comment the following line to enable this function. + stop(paste0('Duplicated feature or sample name(s) found at ',r,' : ',c,',. 
Please check your data.')) + rowNames[i]=paste0(rowNames[i],'.',ind[r,c]) + ind[r,c]=ind[r,c]+1 + } + } + rawData$Feature=rowNames #In case there are duplicated feature names, they are replaced by name.1, name.2, and so on. +} +rawData[,colnames(sampleIDs)]=sampleIDs[rawData$Sample,] +rawData=aggregate(Value~Feature+BatchID+GroupID+BiologicalReplicateID,FUN=mean,data=rawData) #Data are stored in a list. Technical replicates are averaged if present. +rawData$Sample=paste(rawData$BatchID,rawData$GroupID,rawData$BiologicalReplicateID,sep='_') +data=dcast(rawData,Feature~BatchID+GroupID+BiologicalReplicateID,value.var='Value') #Data are also stored in a table with features in rows and samples in columns. +sampleIDs=unique(sampleIDs) +sampleIDs$OriginalID=rownames(sampleIDs) +rownames(sampleIDs)=paste(sampleIDs$BatchID,sampleIDs$GroupID,sampleIDs$BiologicalReplicateID,sep='_') +rownames(data)=data[,1] +data=data[,-1] +data=as.matrix(data) +class(data)='numeric' +sampleIDs=sampleIDs[colnames(data),] +#############################End Reading Data############################# + +wb = createWorkbook() +style1=createStyle(textDecoration='Bold') +style2=createStyle(textDecoration='Bold',wrapText=T) +addWorksheet(wb,'Visual') +addWorksheet(wb, 'Data') +addWorksheet(wb, 'Overall') +addWorksheet(wb, 'Group_Features') +addWorksheet(wb, 'Sample_Features') + +#############################Begin Sheet "Data"############################# +message('Writing the data sheet ...') +if (dataFormat=='table with feature description') { + rownames(featureDescription)=featureDescription[,1] + featureDescription=featureDescription[rownames(featureDescription)%in%rownames(data),] + featureDescription=featureDescription[rownames(data),-1] + writeData(wb,'Data',t(rbind(NA,sampleIDs)),rowNames=T,colNames=T,keepNA=F) + writeData(wb,'Data','',startRow=1,startCol=2) + writeData(wb,'Data',data.frame(Description=featureDescription,data,stringsAsFactors=F,check.names=F),startRow=7,rowNames=T,colNames=F) + 
writeData(wb,'Data','Description',startRow=6,startCol=2) + freezePane(wb,'Data',firstActiveRow=7,firstActiveCol=3) +} else { + writeData(wb,'Data',t(sampleIDs),rowNames=T,colNames=T) + writeData(wb,'Data',data,startRow=7,rowNames=T,colNames=F) + freezePane(wb,'Data',firstActiveRow=7,firstActiveCol=2) +} +nSamples=dim(data)[2] #Bug fixed on 8/5/19. +nFeatures=dim(data)[1] +groupID=as.factor(sampleIDs[,1]) +names(groupID)=rownames(sampleIDs) +groups=levels(groupID) +nGroups=length(groups) +repID=as.factor(sampleIDs[,2]) +names(repID)=rownames(sampleIDs) +batchID=as.factor(sampleIDs[,3]) +names(batchID)=rownames(sampleIDs) +formulaStr='value' +if (length(levels(groupID))>1) { + if (formulaStr=='value') + formulaStr=paste(formulaStr,'group',sep='~') + else + formulaStr=paste(formulaStr,'group',sep='+') +} +if (length(levels(repID))>1&&length(levels(repID))<nSamples) { + if (formulaStr=='value') + formulaStr=paste(formulaStr,'rep',sep='~') + else + formulaStr=paste(formulaStr,'rep',sep='+') +} +if (length(levels(batchID))>1) { + if (formulaStr=='value') + formulaStr=paste(formulaStr,'batch',sep='~') + else + formulaStr=paste(formulaStr,'batch',sep='+') +} +formulaForFitting=NULL +if (formulaStr!='value') + formulaForFitting=as.formula(formulaStr) +#############################End Sheet "Data"############################# + +#############################Begin Sheet "Normalized_Log2_Data"############################# +message ('Normalizing and transforming data ...') +normData=rawData #Long-list +sizeRatios=data #Wide-table +sizeRatios=sizeRatios[rowSums(sizeRatios,na.rm=T)!=0,] #All-zero rows are not used for normalization. +if (normalization!='No'||logTransform) { + sheetName='Data' + if (normalization!='No') { + if (normalization=='RLE') { + sizeRatios[sizeRatios==0]=NA #Zero values are ignored. +# sizeRatios=sizeRatios/exp(rowMeans(log(sizeRatios),na.rm=T)) #Buggy: Each row is divided by its geometric mean. 
+ sizeRatios=sweep(sizeRatios,1,exp(rowMeans(log(sizeRatios),na.rm=T)),'/') #Each row is divided by its geometric mean. + sizeFactors=colMedians(sizeRatios,na.rm=T) #Median of ratios within each sample is taken as its size factor. + sizeRatios=melt(log2(sizeRatios),varnames=c('Feature','Sample'), value.name ='Value') + sizeRatios$Sample=as.character(sizeRatios$Sample) + sizeRatios[,colnames(sampleIDs)]=sampleIDs[sizeRatios$Sample,] + } + else { + if (normalization=='BySize') + sizeFactors=sampleIDs[,4] #If normalize by size, size factors are taken from the input file. Size factors can be e.g. cell counts, spike-in amount etc. + if (normalization=='ByMedian') #If normalize by median, the median feature value of each sample is taken as its size factor. + sizeFactors=colMedians(sizeRatios) + if (normalization=='ByUpperQuantile') #If normalize by upper quantile, the (uqProb*100)-th quantile value of each sample is taken as its size factor. uqProb is taken from the input file or defaults to 0.75 i.e. the upper quartile. + sizeFactors=colQuantiles(sizeRatios,probs=uqProb,na.rm=T) + if (normalization=='BySum') #If normalize by sum, the sum of feature values of each sample is taken as its size factor. + sizeFactors=colSums(sizeRatios,na.rm=T) + } + if (sum(sizeFactors==0)==0) { + sizeFactors=sizeFactors/median(sizeFactors) + data=sweep(data,2,sizeFactors,'/') #Raw data are normalized by the size factor per sample. A bug regarding this line is fixed. + names(sizeFactors)=colnames(data) + normData$Value=normData$Value/sizeFactors[normData$Sample] + sheetName=paste0('Norm_',sheetName) + } + else { + message('Size factor(s)=0. 
No normalization is applied.') + normalization='No' + } + } + if (logTransform) { + if (sum(data+logOffset<=0,na.rm=T)) + stop('Offset of log2-transformation is too small to make all values positive.') + data=log2(data+logOffset) + sheetName=paste0('Log2(x+',logOffset,')_',sheetName) + } + addWorksheet(wb, sheetName) + if (normalization!='No') { + sampleIDs=data.frame(sampleIDs[,1:4],SizeFactor=sizeFactors,OriginalID=sampleIDs[,5]) + if (dataFormat=='table with feature description') { + writeData(wb,sheetName,t(rbind(NA,sampleIDs)),rowNames=T,colNames=T,keepNA=F) + writeData(wb,sheetName,'',startRow=1,startCol=2) + writeData(wb,sheetName,'Description',startRow=7,startCol=2) + writeData(wb,sheetName,data.frame(Description=featureDescription,data,stringsAsFactors=F,check.names=F),startRow=8,rowNames=T,colNames=F) + freezePane(wb,sheetName,firstActiveRow=8,firstActiveCol=3) + } else { + writeData(wb,sheetName,t(sampleIDs),rowNames=T,colNames=T) + writeData(wb,sheetName,data,startRow=8,rowNames=T,colNames=F) + freezePane(wb,sheetName,firstActiveRow=8,firstActiveCol=2) + } + } else { + if (dataFormat=='table with feature description') { + writeData(wb,sheetName,t(rbind(NA,sampleIDs)),rowNames=T,colNames=T,keepNA=F) + writeData(wb,sheetName,'',startRow=1,startCol=2) + writeData(wb,sheetName,data.frame(Description=featureDescription,data,stringsAsFactors=F,check.names=F),startRow=7,rowNames=T,colNames=F) + writeData(wb,sheetName,'Description',startRow=6,startCol=2) + freezePane(wb,sheetName,firstActiveRow=7,firstActiveCol=3) + } else { + writeData(wb,sheetName,t(sampleIDs),rowNames=T,colNames=T) + writeData(wb,sheetName,data,startRow=7,rowNames=T,colNames=F) + freezePane(wb,sheetName,firstActiveRow=7,firstActiveCol=2) + } + } +} +#############################End Sheet "Normalized_Log2_Data"############################# + +#############################Begin Sheet "Selected_Features"############################# +message ('Writing the selected features sheet ...') 
+nSelectedFeatures=0 +if (length(interestedFeatures)>0) { + selectedFeatures=rownames(data) %in% interestedFeatures + nSelectedFeatures=sum(selectedFeatures) +} +if (nSelectedFeatures>0) { + if (dataFormat=='table with feature description') { + report=data.frame(Description=featureDescription,data,stringsAsFactors=F,check.names=F)[selectedFeatures,] + } else { + report=data.frame(data,stringsAsFactors=F,check.names=F)[selectedFeatures,] + } + addWorksheet(wb, 'Selected_Features') + writeData(wb,'Selected_Features',report,rowNames=T) + writeData(wb,'Selected_Features','Feature') + addStyle(wb,'Selected_Features',style2,rows=1,cols=1:200) +} +#############################End Sheet "Selected_Features"############################# + +#############################Begin Sheet "Visual"############################# +if (visual) { +message ('Generating figures ...') +options(warn=-1) +plotID=0 +plotTypes=c('raw') +if (logTransform) + plotTypes=c(plotTypes,'log2') +if (normalization!='No') { + plotTypes=c(plotTypes,'norm') + if (logTransform) + plotTypes=c(plotTypes,'log2_norm') +} +if (normalization=='RLE') + plotTypes=c(plotTypes,'sizes') +if (nSelectedFeatures>0) + plotTypes=c(plotTypes,paste0(plotTypes,'_features')) +for (plotType in plotTypes) { + if (grepl('sizes',plotType)) { + plotData=sizeRatios + titleText='Log2-Transformed Size Ratios for RLE Normalization' + } else if (grepl('norm',plotType)) { + plotData=normData + titleText='Normalized Data' + } else { + plotData=rawData + titleText='Raw Data' + } + if (grepl('log2',plotType)) { + plotData$Value=log2(plotData$Value+logOffset) + titleText=paste0('Log2(x+',logOffset,')-Transformed ',titleText) + } + if (grepl('features',plotType)) { + plotData=plotData[plotData$Feature %in% interestedFeatures,] + titleText=paste0(titleText,' (Selected Features Only)') + } + if (sum(!is.na(plotData$Feature))==0) + next + tempData=dcast(plotData,Feature~Sample,value.var='Value') #Table format of the plotData. 
+ rownames(tempData)=tempData[,1] + tempData=as.matrix(tempData[,-1]) + nonZeroFeatures=rownames(tempData)[rowSums(tempData,na.rm=T)!=0] + if (length(nonZeroFeatures)>0) { + plotData=plotData[plotData$Feature %in% nonZeroFeatures,] #Zero features are excluded from visualization. + plotFile=file.path(figuresPath,paste0(plotType,'.ps')) + plotFile2=file.path(figuresPath,paste0(plotType,'.png')) + p1=ggplot(plotData, aes(x=Sample, y=Value, color=GroupID))+geom_violin(color='black')+geom_jitter(position=position_jitter(0.2),size=0.5)+geom_boxplot(color='black',outlier.size=-1,width=0.1)+facet_grid(.~BatchID,scales='free_x',space='free_x')+theme(axis.text.x = element_text(angle = 90, hjust = 1),legend.position = "none")+ggtitle(paste0('Violin-Boxplot of ',titleText)) + p2=ggplot(plotData,aes(x=Value,color=GroupID))+geom_histogram(bins=20)+facet_grid(.~BatchID+Sample)+ggtitle(paste0('Histogram of ',titleText))+theme(strip.text.x = element_blank(),axis.text.x = element_text(angle = 90, hjust = 1),legend.position = "none") + statsData=melt(do.call(data.frame,aggregate(Value~Sample,FUN=function(x) c(Median=median(x),Sum=sum(x)),data=plotData)),id.vars='Sample') + colnames(statsData)=c('Sample','Stat','Value') + statsData$Sample=as.character(statsData$Sample) + statsData$GroupID=sampleIDs[statsData$Sample,1] + statsData$BatchID=sampleIDs[statsData$Sample,3] + p3=ggplot(statsData,aes(x=Sample,y=Value,color=GroupID))+geom_bar(stat='identity')+facet_grid(Stat~BatchID,scales='free',space='free_x')+theme(axis.text.x = element_text(angle = 90, hjust = 1),legend.position = "none")+ggtitle(paste0('Statistics of ',titleText)) + p=plot_grid(p1,p2,align='v',axis='tblr',nrow=2,rel_heights = c(3/5, 2/5)) + p=plot_grid(p,p3,ncol=2,rel_widths=c(3/4,1/4)) + ggsave(plotFile,plot=p,width=min(50,max(dim(data)[2],12)),height=8,units='in',limitsize=F) + ggsave(plotFile2,plot=p,width=min(50,max(dim(data)[2],12)),height=8,units='in',limitsize=F) + insertImage(wb,'Visual',plotFile2, 
width=dim(data)[2], height=8, unit='in',startRow=plotID*45+1,startCol=1) + plotID=plotID+1 + tempData=tempData[rowSums(is.na(tempData))==0&apply(tempData,1,sd)>0,] #Clustering doesn't work with NAs. Scaling doesn't work with zero-sd rows. + if (!is.null(dim(tempData))) + if (dim(tempData)[1]>1) { + plotFile=file.path(figuresPath,paste0(plotType,'_heatmap.ps')) + plotFile2=file.path(figuresPath,paste0(plotType,'_heatmap.png')) + p=pheatmap(tempData,main=paste0('Heatmap of ',titleText),border_color=NA,show_rownames=grepl('features',plotType),treeheight_row=0,annotation_col=sampleIDs[colnames(tempData),c('BatchID','GroupID')],annotation_legend=T,na_col='gray',scale='none',col=heatmapColors,cluster_rows=hclustFeatures,cluster_cols=hclustSamples) + ggsave(plotFile,plot=p$gtable,width=min(50,max(dim(data)[2],12)),height=8,units='in',limitsize=F) + ggsave(plotFile2,plot=p$gtable,width=min(50,max(dim(data)[2],12)),height=8,units='in',limitsize=F) + insertImage(wb,'Visual',plotFile2, width=min(50,max(dim(data)[2]/3,4)), height=8, unit='in',startRow=plotID*45+1,startCol=1) + plotFile=file.path(figuresPath,paste0(plotType,'_scaled_heatmap.ps')) + plotFile2=file.path(figuresPath,paste0(plotType,'_scaled_heatmap.png')) + p=pheatmap(tempData,main=paste0('Scaled Heatmap of ',titleText),border_color=NA,show_rownames=grepl('features',plotType),treeheight_row=0,annotation_col=sampleIDs[colnames(tempData),c('BatchID','GroupID')],annotation_legend=T,na_col='gray',scale='row',col=heatmapColors,cluster_rows=hclustFeatures,cluster_cols=hclustSamples) + ggsave(plotFile,plot=p$gtable,width=min(50,max(dim(data)[2],12)),height=8,units='in',limitsize=F) + ggsave(plotFile2,plot=p$gtable,width=min(50,max(dim(data)[2],12)),height=8,units='in',limitsize=F) + insertImage(wb,'Visual',plotFile2, width=min(50,max(dim(data)[2]/3,4)), height=8, unit='in',startRow=plotID*45+1,startCol=ceiling(max(dim(data)[2]/3,4)*2)) + plotFile=file.path(figuresPath,paste0(plotType,'_pca_cor.ps')) + 
plotFile2=file.path(figuresPath,paste0(plotType,'_pca_cor.png')) + plotData=data.frame(prcomp(t(tempData))$x,sampleIDs) + p1=ggplot(plotData,aes(x=PC1,y=PC2,color=GroupID))+geom_point(aes(shape=BatchID))+ggtitle(paste0('PCA of ',titleText)) + if (dim(tempData)[1]>2) + pMat=cor_pmat(tempData) + else + pMat=NULL + if (sum(is.na(cor(tempData)))>0) + hClust=F + else + hClust=T + p2=ggcorrplot(cor(tempData), hc.order = hClust, outline.col = "white",p.mat=pMat)+ggtitle(paste0('Pearson Correlation of ',titleText)) + p=plot_grid(p1,p2,ncol=2,rel_widths=c(1/3,2/3)) +# p2=ggpairs(as.data.frame(tempData),lower=list(continuous='smooth'),title=paste0('Scatter Plot of ',titleText)) +# p=plot_grid(p1,ggmatrix_gtable(p2),ncol=2,rel_widths=c(1/3,2/3)) + ggsave(plotFile,plot=p,width=min(50,max(dim(data)[2]*2,16)),height=8,units='in',limitsize=F) + ggsave(plotFile2,plot=p,width=min(50,max(dim(data)[2]*2,16)),height=8,units='in',limitsize=F) + insertImage(wb,'Visual',plotFile2, width=min(50,max(dim(data)[2]*2,16)), height=8, unit='in',startRow=plotID*45+1,startCol=ceiling(max(dim(data)[2]/3,4)*4)) + plotID=plotID+1 + } + } +} +} #End if (visual) +options(warn=0) +#############################End Sheet "Visual"############################# + +#############################Begin Sheet "Overall"############################# +message('Writing the overall summary sheet ...') +featureExpressed=(data>valueFilter) +nValueFilteredFeatures=sum(rowSums(featureExpressed,na.rm=T)>0) +if (sum(data>valueFilter,na.rm=T)>0 && groupMeanLowerBound==0) { + if (logTransform) + groupMeanLowerBound=min(data[data>valueFilter],na.rm=T)-1 #Bug fix: this was added on 8/8/19. 
+ else + groupMeanLowerBound=min(data[data>valueFilter],na.rm=T)/2 +} +report=matrix(nrow=19,ncol=1) +colnames(report)='Value' +rownames(report)=c('Samples','Groups','Features','Value-filtered features','Value filter','Count filter','Group mean lower-bound','Adjusted p-value cutoff','Fold change cutoff','Normalization','Log2 transformation','Offset of log2 transformation','Statistical test method','Data imputation method','Formula for linear model','Visualization','Y axis for volcano plot','Clustered samples','Clustered features') +report[1]=nSamples +report[2]=nGroups +report[3]=nFeatures +report[4]=nValueFilteredFeatures +report[5]=valueFilter +report[6]=groupCountFilter +report[7]=groupMeanLowerBound +report[8]=paste(pCutoffs,collapse=',') +report[9]=paste(fcCutoffs,collapse=',') +report[10]=normalization +if (normalization=='ByUpperQuantile') + report[10]=paste0(report[10],'-',uqProb*100,'%') +if (logTransform) { + report[11]='Yes' +} else { + report[11]='No' +} +report[12]=logOffset +report[13]=statTest +report[14]=dataImputation +report[15]=formulaStr +if (visual) { + report[16]='Yes' +} else { + report[16]='No' +} +report[17]=volcanoP +if (hclustSamples) { + report[18]='Yes' +} else { + report[18]='No' +} +if (hclustFeatures) { + report[19]='Yes' +} else { + report[19]='No' +} +writeData(wb,'Overall',report,rowNames=T,borders='surrounding') +writeData(wb,'Overall',verNum) +writeComment(wb,'Overall',2,2,createComment('Total number of samples',visible=F)) +writeComment(wb,'Overall',2,3,createComment('Total number of groups',visible=F)) +writeComment(wb,'Overall',2,4,createComment('Total number of features e.g. 
metabolites, proteins, etc.',visible=F)) +writeComment(wb,'Overall',2,5,createComment('Total number of features with at least one sample that\'s greater than the cutoff; these features are considered \"expressed\"',visible=F)) +writeComment(wb,'Overall',2,6,createComment('Feature value cutoff; only feature values greater than this number are considered \"expressed\"',visible=F)) +writeComment(wb,'Overall',2,7,createComment('Within-group count cutoff; only features expressed by more than this number of replicates in a group will be included for comparisons',visible=F)) +writeComment(wb,'Overall',2,8,createComment('Group mean values less than this number will be replaced by it',visible=F)) +writeComment(wb,'Overall',2,9,createComment('Adjusted p-values less than this number will be considered significant',visible=F)) +writeComment(wb,'Overall',2,10,createComment('Fold changes greater than this number will be considered significant',visible=F)) +writeComment(wb,'Overall',2,11,createComment('If Yes, non-zero values are used to calculate a geometric mean per feature, feature values are divided by the geometric mean to obtain ratios, and the median of such ratios per sample is used as an estimation of the size factor per sample. 
Feature values in a sample are divided by this sample-specific size factor.',visible=F)) +writeComment(wb,'Overall',2,12,createComment('If yes, data are log2-transformed with or without an offset.',visible=F)) +writeComment(wb,'Overall',2,13,createComment('If >0, data=data+offset before the log2-transformation, otherwise only the positive values are log2-transformed but zeros are unchanged.',visible=F)) +writeComment(wb,'Overall',2,14,createComment('Statistical test method.',visible=F)) +writeComment(wb,'Overall',2,15,createComment('Data imputation method.',visible=F)) +writeComment(wb,'Overall',2,16,createComment('Formula for the linear model such as cpglm, glm etc..',visible=F)) +writeComment(wb,'Overall',2,17,createComment('If yes, figures are saved in the result file as well as in a Figure folder.',visible=F)) +writeComment(wb,'Overall',2,18,createComment('Y axis used for the volcano plot.',visible=F)) +writeComment(wb,'Overall',2,19,createComment('If yes, samples on the heatmaps are hierarchical-clustered.',visible=F)) +writeComment(wb,'Overall',2,20,createComment('If yes, features on the heatmaps are hierarchical-clustered.',visible=F)) + +report=matrix(nrow=nGroups,ncol=2) +colnames(report)=c('Samples','Value-count-filtered features') +rownames(report)=groups +sumGroupCount=matrix(nrow=nFeatures,ncol=nGroups) +colnames(sumGroupCount)=groups +for (i in 1:nGroups) { + groupCount=featureExpressed[,groupID==groups[i],drop=F] + sumGroupCount[,i]=rowSums(groupCount,na.rm=T)>=groupCountFilter + report[groups[i],1]=dim(groupCount)[2] + report[groups[i],2]=sum(sumGroupCount[,i]) + groupCount[is.na(groupCount)]=F + if (visual&sum(groupID==groups[i])<=6) { + ggsave(file.path(figuresPath,paste0('euler_',i,'.ps')),plot=plot(euler(groupCount),quantities=T,col=colors[1:dim(groupCount)[2]],fill=F,fill_opacity=0.7,legend=T)) + 
ggsave(file.path(figuresPath,paste0('euler_',i,'.png')),plot=plot(euler(groupCount),quantities=T,col=colors[1:dim(groupCount)[2]],fill=F,fill_opacity=0.7,legend=T)) + } +} +if (visual&nGroups<=6) { + ggsave(file.path(figuresPath,'euler_all.ps'),plot=plot(euler(sumGroupCount),quantities=T,col=colors[1:nGroups],fill=F,fill_opacity=0.7,legend=T)) + ggsave(file.path(figuresPath,'euler_all.png'),plot=plot(euler(sumGroupCount),quantities=T,col=colors[1:nGroups],fill=F,fill_opacity=0.7,legend=T)) +} +writeData(wb,'Overall',report,rowNames=T,startCol=5,borders='surrounding') +writeData(wb,'Overall','Group',startCol=5) +writeComment(wb,'Overall',5,1,createComment('Group name; a group is a set of replicates to be used in a statistical comparison',visible=F)) +writeComment(wb,'Overall',6,1,createComment('Number of replicates in a group',visible=F)) +writeComment(wb,'Overall',7,1,createComment('Number of features that pass both the value and the within-group count filters',visible=F)) + +report=matrix(nrow=nSamples,ncol=1) +colnames(report)='Value-filtered features' +rownames(report)=colnames(data) +for (i in 1:nSamples) + report[i]=sum(featureExpressed[,i],na.rm=T) +writeData(wb,'Overall',report,rowNames=T,startCol=10,borders='surrounding') +writeData(wb,'Overall','Sample',startCol=10) +writeComment(wb,'Overall',10,1,createComment('Sample name',visible=F)) +writeComment(wb,'Overall',11,1,createComment('Number of features that pass the value filter',visible=F)) +addStyle(wb,'Overall',style1,rows=1,cols=1:100) +setColWidths(wb,'Overall',1:100,widths='auto') +freezePane(wb,'Overall',firstActiveRow=2) + +if (visual) { + if (nGroups<=6) + insertImage(wb,'Overall',file.path(figuresPath,'euler_all.png'),width=4,height=4,unit='in',startRow=nSamples+4,startCol=7) + for (i in 1:nGroups) { + if (sum(groupID==groups[i])<=6) + insertImage(wb,'Overall',file.path(figuresPath,paste0('euler_',i,'.png')),width=4,height=4,unit='in',startRow=nSamples+4+i*20,startCol=7) + } +} + 
#############################End Sheet "Overall"#############################

#############################Begin Sheet "Group_Features"#############################
# Builds the Group_Features sheet: one row per feature, one column per group,
# each cell = number of replicates in that group whose value passes the value
# filter. Also derives groupExpressed (feature x group logical matrix used by
# the downstream differential-expression sections).
message('Writing the group features sheet ...')
report=matrix(nrow=nFeatures,ncol=nGroups)
colnames(report)=groups
rownames(report)=rownames(data)
for (i in groups)
  report[,i]=rowSums(featureExpressed[,groupID==i,drop=F],na.rm=T) #per-group count of expressed replicates; NAs treated as not expressed
groupExpressed=report>=groupCountFilter #TRUE where the group passes the within-group count filter
if (dataFormat=='table with feature description') {
  #Prepend the description column and a per-feature count of passing groups.
  report=data.frame(Description=featureDescription,Groups=rowSums(groupExpressed),report,stringsAsFactors=F,check.names=F)
  freezePane(wb,'Group_Features',firstActiveRow=2,firstActiveCol=3)
} else {
  report=data.frame(Groups=rowSums(groupExpressed),report,stringsAsFactors=F,check.names=F)
  freezePane(wb,'Group_Features',firstActiveRow=2,firstActiveCol=2)
}
writeData(wb,'Group_Features',report,rowNames=T)
writeData(wb,'Group_Features','Feature') #label for the row-name column (cell A1)
#writeComment args are (wb, sheet, col, row, comment).
#NOTE(review): when a Description column is present the data columns shift right by one,
#so the comments on columns 2 and 3 land one column early - confirm intended targets.
writeComment(wb,'Group_Features',1,1,createComment('Feature name',visible=F))
writeComment(wb,'Group_Features',2,1,createComment('Each cell in this column tells the number of groups that pass the count filter for a specific feature',visible=F))
writeComment(wb,'Group_Features',3,1,createComment('Each cell in this or the succeeding columns tells the number of replicates in a group that pass the value filter for a specific feature',visible=F))
addStyle(wb,'Group_Features',style2,rows=1,cols=1:200) #Consistency fix: was 1:100; Sample_Features and the Selected_Features call below style 1:200
if (nSelectedFeatures>0) {
  #Append the same table (selected features only) to the Selected_Features sheet.
  report=report[selectedFeatures,]
  writeData(wb,'Selected_Features',report,rowNames=T,startRow=nSelectedFeatures+4)
  writeData(wb,'Selected_Features','Feature',startRow=nSelectedFeatures+4)
  addStyle(wb,'Selected_Features',style2,rows=nSelectedFeatures+4,cols=1:200)
}
#############################End Sheet "Group_Features"#############################

#############################Begin Sheet "Sample_Features"#############################
message('Writing the sample features sheet ...')
+report=featureExpressed +class(report)='integer' +if (dataFormat=='table with feature description') { + report=data.frame(Description=featureDescription,Samples=rowSums(featureExpressed),report,stringsAsFactors=F,check.names=F) + freezePane(wb,'Sample_Features',firstActiveRow=2,firstActiveCol=3) +} else { + report=data.frame(Samples=rowSums(featureExpressed),report,stringsAsFactors=F,check.names=F) + freezePane(wb,'Sample_Features',firstActiveRow=2,firstActiveCol=2) +} +writeData(wb,'Sample_Features',report,rowNames=T) +writeData(wb,'Sample_Features','Feature') +writeComment(wb,'Sample_Features',1,1,createComment('Feature name',visible=F)) +writeComment(wb,'Sample_Features',2,1,createComment('Each cell in this column tells the number of samples that pass the value filter for a specific feature',visible=F)) +writeComment(wb,'Sample_Features',3,1,createComment('Each cell in this or the succeeding columns tells whether the value passes the value filter for a specific feature; 1 means yes and 0 means no',visible=F)) +addStyle(wb,'Sample_Features',style2,rows=1,cols=1:200) +if (nSelectedFeatures>0) { + report=report[selectedFeatures,] + writeData(wb,'Selected_Features',report,rowNames=T,startRow=2*nSelectedFeatures+7) + writeData(wb,'Selected_Features','Feature',startRow=2*nSelectedFeatures+7) + addStyle(wb,'Selected_Features',style2,rows=2*nSelectedFeatures+7,cols=1:200) +} +#############################End Sheet "Sample_Features"############################# + +comparisonsSummary=matrix(nrow=0,ncol=11) +colnames(comparisonsSummary)=c('Control','Treatment','Control_Only','Treatment_Only','Absent_in_Both','Present_in_Both','Up_in_Treatment','Down_in_Treatment','Significant','Adjusted_p_Cutoff','FC_Cutoff') +options(warn=2) +if (dataImputation!='no') + data[data<=valueFilter]=NA +if (dataImputation=='value filter') + data[is.na(data)]=valueFilter +if (dataImputation=='mean') + tempData=rowMeans(data,na.rm=T) +if (dataImputation=='median') + 
tempData=rowMedians(data,na.rm=T) +if (dataImputation=='half minimum') { + tempData=rowMins(data,na.rm=T) + if (logTransform) + tempData=tempData-1 + else + tempData=tempData/2 +} +if (dataImputation=='mean' || dataImputation=='median' || dataImputation=='half minimum') { + tempData[tempData==Inf]=groupMeanLowerBound + tempData[is.nan(tempData)]=groupMeanLowerBound + tempData[is.na(tempData)]=groupMeanLowerBound + tempData=replicate(dim(data)[2],tempData) + data[is.na(data)]=tempData[is.na(data)] +} +#############################Begin Sheet "Control_vs_Others"############################# +message ('Writing the differential expression result for control vs. other groups ...') +nControls=length(controlGroups) +if (nControls>0) { + for (k in 1:nControls) { + report=matrix(NA,nrow=nFeatures,ncol=(nGroups-1)*6+2) + controlGroup=controlGroups[k] + treatGroups=groups[groups!=controlGroup] + group=C(groupID,base=which(rownames(contrasts(groupID))==controlGroup)) + colnames(contrasts(group))=rownames(contrasts(group))[rownames(contrasts(group))!=controlGroup] + if (logTransform) + tempStr=c('_log2_Mean','_log2_Adjusted_Mean') + else + tempStr=c('_Mean','_Adjusted_Mean') + colnames(report)=c(paste0(controlGroup,tempStr),as.vector(t(outer(treatGroups,c(tempStr,'_log2_FC','_FC','_p_Value','_Adjusted_p'),paste0)))) + rownames(report)=rownames(data) + for (i in 1:nFeatures) { + group1=data[i,groupID==controlGroup] + if (sum(!is.na(group1))==0) + group1=rep(groupMeanLowerBound,length(group1)) + if(sum(!is.na(group1))==1) + sd1=0 + else + sd1=sd(group1,na.rm=T) + report[i,1]=mean(group1,na.rm=T) + report[i,2]=report[i,1] + if (report[i,2]<groupMeanLowerBound) + report[i,2]=groupMeanLowerBound + for (j in 1:(nGroups-1)) { + group2=data[i,groupID==treatGroups[j]] + if (sum(!is.na(group2))==0) + group2=rep(groupMeanLowerBound,length(group2)) + if(sum(!is.na(group2))==1) + sd2=0 + else + sd2=sd(group2,na.rm=T) + report[i,(j-1)*6+3]=mean(group2,na.rm=T) + 
report[i,(j-1)*6+4]=report[i,(j-1)*6+3] #adjusted treatment mean, clamped below by groupMeanLowerBound
if (report[i,(j-1)*6+4]<groupMeanLowerBound)
  report[i,(j-1)*6+4]=groupMeanLowerBound
if (logTransform)
  report[i,(j-1)*6+5]=report[i,(j-1)*6+4]-report[i,2] #Log2 FC
else
  report[i,(j-1)*6+5]=log2(report[i,(j-1)*6+4]/report[i,2]) #Log2 FC
report[i,(j-1)*6+6]=2^report[i,(j-1)*6+5] #linear-scale fold change
#Only test features expressed (per the count filter) in at least one of the two groups.
if ((groupExpressed[i,controlGroup] || groupExpressed[i,treatGroups[j]]))
  if (statTest=='t-test') {
    #Degenerate cases: when one group has zero variance, fall back to a one-sample
    #t-test of the other group against that group's mean; skip if both are constant.
    if (sd1==0&&sd2!=0)
      report[i,(j-1)*6+7]=t.test(group2,mu=mean(group1,na.rm=T))$p.value
    else if (sd1!=0&&sd2==0)
      report[i,(j-1)*6+7]=t.test(group1,mu=mean(group2,na.rm=T))$p.value
    else if (sd1!=0&&sd2!=0)
      report[i,(j-1)*6+7]=t.test(group1,group2)$p.value
  } else if (statTest=='cpglm-glm') {
    #Try a compound-Poisson GLM first; fall back to a gaussian GLM if it fails.
    testData=list(value=data[i,],group=group,rep=repID,batch=batchID)
    fit=try(cpglm(formulaForFitting,data=testData,link='identity'),silent=T)
    if (class(fit)=='cpglm'&&fit$converged)
      coefTable=try(coef(summary(fit)),silent=T) #Bug fix: silent=T was misplaced inside coef(); now matches the DE_Comparisons section
    if (class(fit)=='cpglm'&&fit$converged&&class(coefTable)!='try-error') {
      rowName=paste0("group",treatGroups[j])
      if (rowName %in% rownames(coefTable))
        report[i,(j-1)*6+7]=coefTable[rowName,4] #p-value of the treatment coefficient
    } else {
      fit=try(glm(formulaForFitting,data=testData,family=gaussian(link='identity')),silent=T)
      if (class(fit)=='glm'&&fit$converged) {
        rowName=paste0("group",treatGroups[j])
        coefTable=coef(summary(fit))
        if (rowName %in% rownames(coefTable))
          report[i,(j-1)*6+7]=coefTable[rowName,4]
      }
    }
  } else if (statTest=='glm') {
    testData=list(value=data[i,],group=group,rep=repID,batch=batchID)
    fit=try(glm(formulaForFitting,data=testData,family=gaussian(link='identity')),silent=T)
    if (class(fit)=='glm'&&fit$converged) {
      rowName=paste0("group",treatGroups[j])
      coefTable=coef(summary(fit))
      if (rowName %in% rownames(coefTable))
        report[i,(j-1)*6+7]=coefTable[rowName,4]
    }
  }

 } #end for j (treatment groups)
} #end for i (features)
for (j in 1:(nGroups-1)) {
  report[,(j-1)*6+8]=p.adjust(report[,(j-1)*6+7],method='fdr') #FDR-adjust raw p-values within each comparison column
}
report=data.frame(report,data[,groupID==controlGroup,drop=F],stringsAsFactors=F,check.names=F) + for (j in 1:(nGroups-1)) + report=data.frame(report,data[,groupID==treatGroups[j],drop=F],stringsAsFactors=F,check.names=F) + sheet = addWorksheet(wb, paste0(controlGroup,'_vs_Others')) + if (dataFormat=='table with feature description') { + writeData(wb,sheet,data.frame(Description=featureDescription,report),rowNames=T,keepNA=F,stringsAsFactors=F,check.names=F) + writeData(wb,sheet,'Feature') + freezePane(wb,sheet,firstActiveRow=2,firstActiveCol=3) + } else { + writeData(wb,sheet,report,rowNames=T,keepNA=F) + writeData(wb,sheet,'Feature') + freezePane(wb,sheet,firstActiveRow=2,firstActiveCol=2) + } + addStyle(wb,sheet,style2,rows=1,cols=1:200) + for (j in 1:(nGroups-1)) { + expressed1=groupExpressed[,controlGroup] + expressed2=groupExpressed[,treatGroups[j]] + fc=report[,(j-1)*6+6] + rfc=1/fc + pRaw=report[,(j-1)*6+7] + p=report[,(j-1)*6+8] + for (pc in pCutoffs) + for (fcc in fcCutoffs) + comparisonsSummary=rbind(comparisonsSummary,c(controlGroup,treatGroups[j],sum(expressed1&!expressed2),sum(!expressed1&expressed2),sum(!expressed1&!expressed2),sum(expressed1&expressed2),sum(!is.na(p)&fc>fcc&p<pc),sum(!is.na(p)&rfc>fcc&p<pc),sum(!is.na(p)&(fc>fcc|rfc>fcc)&p<pc),pc,fcc)) + if (grepl('raw',volcanoP)) + tempData=data.frame(log2(fc),-log(pRaw,10),F) + else + tempData=data.frame(log2(fc),-log(p,10),F) + rownames(tempData)=rownames(report) + tempData=tempData[rowSums(is.na(tempData))==0,] + colnames(tempData)=c('x','y','Selected') + options(warn=-1) + if (visual) { + plotType='volcano' + plotTitle=paste0('Volcano Plot of ',controlGroup,' vs. 
',treatGroups[j]) + if (nSelectedFeatures>0) + tempData[,'Selected']=rownames(tempData) %in% interestedFeatures + else + tempData$Selected=F + p1=ggplot(data.frame(tempData),aes(x=x,y=y,color=Selected))+scale_color_manual(values=c('black','green'))+geom_point()+geom_text(aes(label=ifelse(Selected,rownames(tempData),'')),hjust=0, vjust=0)+xlab('Log2(FC)')+ylab(paste0('-Log10(',volcanoP,')'))+ggtitle(plotTitle)+theme(legend.position="none") + for (pc in pCutoffs) { + p1=p1+geom_hline(yintercept=-log(pc,10),linetype="dashed",color = "green",size=0.2) + for (fcc in fcCutoffs) { + p1=p1+geom_vline(xintercept=log2(fcc),linetype="dashed",color = "green",size=0.2) + p1=p1+geom_vline(xintercept=-log2(fcc),linetype="dashed",color = "green",size=0.2) + } + } + plotFile=file.path(figuresPath,paste0(controlGroup,'_vs_',treatGroups[j],'_',plotType,'.ps')) + plotFile2=file.path(figuresPath,paste0(controlGroup,'_vs_',treatGroups[j],'_',plotType,'.png')) + ggsave(plotFile,plot=p1,width=7,height=7,units='in') + ggsave(plotFile2,plot=p1,width=7,height=7,units='in') + insertImage(wb,'Visual',plotFile2, width=7, height=7, unit='in',startRow=plotID*45+1,startCol=(j-1)*13+1) + } + } + if (visual) + plotID=plotID+1 + options(warn=0) + if (nSelectedFeatures>0) { + if (dataFormat=='table with feature description') { + report=data.frame(Description=featureDescription,report,stringsAsFactors=F,check.names=F)[selectedFeatures,] + writeData(wb,'Selected_Features',report,rowNames=T,startRow=(k+2)*(nSelectedFeatures+3)+1,keepNA=F) + writeData(wb,'Selected_Features','Feature',startRow=(k+2)*(nSelectedFeatures+3)+1) + addStyle(wb,'Selected_Features',style2,rows=(k+2)*(nSelectedFeatures+3)+1,cols=1:200) + } else { + report=report[selectedFeatures,] + writeData(wb,'Selected_Features',report,rowNames=T,startRow=(k+2)*(nSelectedFeatures+3)+1,keepNA=F) + writeData(wb,'Selected_Features','Feature',startRow=(k+2)*(nSelectedFeatures+3)+1) + 
addStyle(wb,'Selected_Features',style2,rows=(k+2)*(nSelectedFeatures+3)+1,cols=1:200) + } + } + } +} +#############################End Sheet "Control_vs_Others"############################# + +#############################Begin Sheet "DE_Comparisons"############################# +message ('Writing the differential expression results for groups ...') +nComparisons=dim(dePairs)[1] +if (nComparisons>0) { + for (k in 1:nComparisons) { + report=matrix(NA,nrow=nFeatures,ncol=8) + controlGroup=dePairs[k,1] + treatGroup=dePairs[k,2] + group=C(groupID,base=which(rownames(contrasts(groupID))==controlGroup)) + colnames(contrasts(group))=rownames(contrasts(group))[rownames(contrasts(group))!=controlGroup] + if (logTransform) + tempStr=c('_log2_Mean','_log2_Adjusted_Mean') + else + tempStr=c('_Mean','_Adjusted_Mean') + colnames(report)=c(paste0(controlGroup,tempStr),paste0(treatGroup,tempStr),paste0(paste0(controlGroup,'_vs_',treatGroup),c('_log2_FC','_FC','_p_Value','_Adjusted_p'))) + rownames(report)=rownames(data) + for (i in 1:nFeatures) { + group1=data[i,groupID==controlGroup] + if (sum(!is.na(group1))==0) + group1=rep(groupMeanLowerBound,length(group1)) + if(sum(!is.na(group1))==1) + sd1=0 + else + sd1=sd(group1,na.rm=T) + report[i,1]=mean(group1,na.rm=T) + report[i,2]=report[i,1] + if (report[i,2]<groupMeanLowerBound) + report[i,2]=groupMeanLowerBound + group2=data[i,groupID==treatGroup] + if (sum(!is.na(group2))==0) + group2=rep(groupMeanLowerBound,length(group2)) + if(sum(!is.na(group2))==1) + sd2=0 + else + sd2=sd(group2,na.rm=T) + report[i,3]=mean(group2,na.rm=T) + report[i,4]=report[i,3] + if (report[i,4]<groupMeanLowerBound) + report[i,4]=groupMeanLowerBound + if (logTransform) + report[i,5]=report[i,4]-report[i,2] + else + report[i,5]=log2(report[i,4]/report[i,2]) + report[i,6]=2^report[i,5] + if ((groupExpressed[i,controlGroup] || groupExpressed[i,treatGroup])) + if (statTest=='t-test') { + if (sd1==0&&sd2!=0) + 
report[i,7]=t.test(group2,mu=mean(group1,na.rm=T))$p.value + else if (sd1!=0&&sd2==0) + report[i,7]=t.test(group1,mu=mean(group2,na.rm=T))$p.value + else if (sd1!=0&&sd2!=0) + report[i,7]=t.test(group1,group2)$p.value + } else if (statTest=='cpglm-glm') { + testData=list(value=data[i,],group=group,rep=repID,batch=batchID) + fit=try(cpglm(formulaForFitting,data=testData,link='identity'),silent=T) + if (class(fit)=='cpglm'&&fit$converged) + coefTable=try(coef(summary(fit)),silent=T) + if (class(fit)=='cpglm'&&fit$converged&&class(coefTable)!='try-error') { + rowName=paste0("group",treatGroup) + if (rowName %in% rownames(coefTable)) + report[i,7]=coefTable[rowName,4] + } else { + fit=try(glm(formulaForFitting,data=testData,family=gaussian(link='identity')),silent=T) + if (class(fit)=='glm'&&fit$converged) { + rowName=paste0("group",treatGroup) + coefTable=coef(summary(fit)) + if (exists('coefTable')&&rowName %in% rownames(coefTable)) + report[i,7]=coefTable[rowName,4] + } + } + } else if (statTest=='glm') { + testData=list(value=data[i,],group=group,rep=repID,batch=batchID) + fit=try(glm(formulaForFitting,data=testData,family=gaussian(link='identity')),silent=T) + if (class(fit)=='glm'&&fit$converged) { + rowName=paste0("group",treatGroup) + coefTable=coef(summary(fit)) + if (rowName %in% rownames(coefTable)) + report[i,7]=coefTable[rowName,4] + } + } + } + report[,8]=p.adjust(report[,7],method='fdr') + report=data.frame(report,data[,groupID==controlGroup,drop=F],data[,groupID==treatGroup,drop=F],stringsAsFactors=F,check.names=F) + sheet = addWorksheet(wb, paste0(controlGroup,'_vs_',treatGroup)) + if (dataFormat=='table with feature description') { + writeData(wb,sheet,data.frame(Description=featureDescription,report,stringsAsFactors=F,check.names=F),rowNames=T,keepNA=F) + writeData(wb,sheet,'Feature') + freezePane(wb,sheet,firstActiveRow=2,firstActiveCol=3) + } else { + writeData(wb,sheet,report,rowNames=T,keepNA=F) + writeData(wb,sheet,'Feature') + 
freezePane(wb,sheet,firstActiveRow=2,firstActiveCol=2) + } + addStyle(wb,sheet,style2,rows=1,cols=1:100) + expressed1=groupExpressed[,controlGroup] + expressed2=groupExpressed[,treatGroup] + fc=report[,6] + rfc=1/fc + pRaw=report[,7] + p=report[,8] + for (pc in pCutoffs) + for (fcc in fcCutoffs) + comparisonsSummary=rbind(comparisonsSummary,c(controlGroup,treatGroup,sum(expressed1&!expressed2),sum(!expressed1&expressed2),sum(!expressed1&!expressed2),sum(expressed1&expressed2),sum(!is.na(p)&fc>fcc&p<pc),sum(!is.na(p)&rfc>fcc&p<pc),sum(!is.na(p)&(fc>fcc|rfc>fcc)&p<pc),pc,fcc)) + if (grepl('raw',volcanoP)) + tempData=data.frame(log2(fc),-log(pRaw,10),F) + else + tempData=data.frame(log2(fc),-log(p,10),F) + rownames(tempData)=rownames(report) + tempData=tempData[rowSums(is.na(tempData))==0,] + colnames(tempData)=c('x','y','Selected') + options(warn=-1) + if (visual) { + plotType='volcano' + plotTitle=paste0('Volcano Plot of ',controlGroup,' vs. ',treatGroup) + if (nSelectedFeatures>0) + tempData[,'Selected']=rownames(tempData) %in% interestedFeatures + else + tempData$Selected=F + p1=ggplot(data.frame(tempData),aes(x=x,y=y,color=Selected))+scale_color_manual(values=c('black','green'))+geom_point()+geom_text(aes(label=ifelse(Selected,rownames(tempData),'')),hjust=0, vjust=0)+xlab('Log2(FC)')+ylab(paste0('-Log10(',volcanoP,')'))+ggtitle(plotTitle)+theme(legend.position="none") + for (pc in pCutoffs) { + p1=p1+geom_hline(yintercept=-log(pc,10),linetype="dashed",color = "green",size=0.2) + for (fcc in fcCutoffs) { + p1=p1+geom_vline(xintercept=log2(fcc),linetype="dashed",color = "green",size=0.2) + p1=p1+geom_vline(xintercept=-log2(fcc),linetype="dashed",color = "green",size=0.2) + } + } + plotFile=file.path(figuresPath,paste0(controlGroup,'_vs_',treatGroup,'_',plotType,'.ps')) + plotFile2=file.path(figuresPath,paste0(controlGroup,'_vs_',treatGroup,'_',plotType,'.png')) + ggsave(plotFile,plot=p1,width=7,height=7,units='in') + 
ggsave(plotFile2,plot=p1,width=7,height=7,units='in') + insertImage(wb,'Visual',plotFile2, width=7, height=7, unit='in',startRow=plotID*45+1,startCol=(k-1)*13+1) + } + if (nSelectedFeatures>0) { + if (dataFormat=='table with feature description') { + report=data.frame(Description=featureDescription,report,stringsAsFactors=F,check.names=F)[selectedFeatures,] + writeData(wb,'Selected_Features',report,rowNames=T,startRow=(k+2+nControls)*(nSelectedFeatures+3)+1,keepNA=F) + writeData(wb,'Selected_Features','Feature',startRow=(k+2+nControls)*(nSelectedFeatures+3)+1) + addStyle(wb,'Selected_Features',style2,rows=(k+2+nControls)*(nSelectedFeatures+3)+1,cols=1:200) + } else { + report=report[selectedFeatures,] + writeData(wb,'Selected_Features',report,rowNames=T,startRow=(k+2+nControls)*(nSelectedFeatures+3)+1,keepNA=F) + writeData(wb,'Selected_Features','Feature',startRow=(k+2+nControls)*(nSelectedFeatures+3)+1) + addStyle(wb,'Selected_Features',style2,rows=(k+2+nControls)*(nSelectedFeatures+3)+1,cols=1:200) + } + } + } + if (visual) + plotID=plotID-1 + options(warn=0) +} +#############################End Sheet "DE_Comparisons"############################# + +#############################Begin Sheet "Overall" (Part 2)############################# +if (dim(comparisonsSummary)[1]>1) { + comparisonsSummary=as.data.frame(comparisonsSummary[order(comparisonsSummary[,10],comparisonsSummary[,11],comparisonsSummary[,1],comparisonsSummary[,2]),]) + comparisonsSummary[,3:11]=apply(comparisonsSummary[,3:11],2,as.numeric) +} +if (dim(comparisonsSummary)[1]>0) { + writeData(wb,'Overall',comparisonsSummary,startCol=14,rowNames=F,borders='surrounding') + writeComment(wb,'Overall',14,1,createComment('Control group name',visible=F)) + writeComment(wb,'Overall',15,1,createComment('Treatment group name',visible=F)) + writeComment(wb,'Overall',16,1,createComment('Number of features that are expressed only by control',visible=F)) + writeComment(wb,'Overall',17,1,createComment('Number of 
features that are expressed only by treatment',visible=F)) + writeComment(wb,'Overall',18,1,createComment('Number of features that are expressed by neither groups',visible=F)) + writeComment(wb,'Overall',19,1,createComment('Number of features that are expressed by both groups',visible=F)) + writeComment(wb,'Overall',20,1,createComment('Number of features that are significantly up-regulated by treatment',visible=F)) + writeComment(wb,'Overall',21,1,createComment('Number of features that are significantly down-regulated by treatment',visible=F)) + writeComment(wb,'Overall',22,1,createComment('Number of features that are significantly up- or down-regulated by treatment',visible=F)) +} +#############################End Sheet "Overall" (Part 2)############################# + +saveWorkbook(wb,outputFile,overwrite=F) +unlink('Rplots.pdf') + +#############################End Main############################# +message('Analysis completed.') +message('###################################################################') + diff --git a/V1.7.1/ReadMe.txt b/V1.7.1/ReadMe.txt new file mode 100755 index 0000000000000000000000000000000000000000..9aa68e06e3785fa9307fbdcb1391e0e235ed55a8 --- /dev/null +++ b/V1.7.1/ReadMe.txt @@ -0,0 +1,25 @@ +To run this analysis tool, first make a copy of the data template, read the instructions in the template and make necessary changes. + +To run the tool on BioHPC @ UTSW: Open a BioHPC web visualization node and run the following from command line: + +sh /project/CRI/shared/Metabolomics/run_analysis.sh input_path/input_file_name.xlsm output_path/output_file_name.xlsx optional_BioHPC_queue_name #input_file is the copy of the data template with your data. output_file is the analysis result. + +######################################################################### +Change Log: +#v1.7.1, 12/04/2019: Figures were saved as both .ps and .png files. Euler plot was disabled for >6 samples. Samples / features can be clustered or not on the heatmaps. 
+#v1.7.0, 10/22/2019: Figures were saved as .ps files in a Figure folder. Correlation plot was generated with the old method for better speed. Euler plot was disabled for >10 samples. Y axis of volcano plot could be raw or adjusted p-values. DESeq was renamed RLE as I would implement the DESeq differential expression analysis in the future. RLE is the same method used by DESeq but I slightly modified it to allow for zeros, missing values and / or values below the value filter. Due to this reason I couldn't use the DESeq normalization directly.
+#v1.6.9, 10/4/2019: A bug regarding cpglm was fixed. The bug caused an error in cpglm fitting but the class(fit) was still 'cpglm' and fit$converged was still TRUE. Visualization option was added. Version number checkup was added for the data template. Multiple other minor improvements.
+#v1.6.8, 09/19/2019: A bug regarding featureDescription was fixed. The bug was present only when data format was "table with feature description" and samples were excluded which happened to cause some features to be all-empty so that those features needed to be excluded as well.
+#v1.6.7, 08/29/2019: Correlation plot was enhanced. Euler diagram was added. Software version number was reported in the result file.
+#v1.6.6, 08/28/2019: Data table with feature description column was enabled, minor bugs fixed, and volcano plot colors customized.
+#v1.6.5, 08/27/2019: A bug regarding number of features per sample when data imputation = "NA" was fixed.
+#v1.6.4, 08/08/2019: A bug regarding groupMeanLowerBound was fixed. Definition and default value of groupCountFilter were modified. Data imputation was implemented. Missing values were allowed.
+#v1.6.3, 08/05/2019: A bug regarding nSamples and formula string generation (only when technical replicates are present) was fixed.
+#v1.6.2, 07/24/2019: A bug regarding normalization size factors was fixed. The bug caused size factors to be applied to incorrect samples. If you have used versions before this fix, please rerun your analysis using this fixed version.
+#v1.6.1, 07/03/2019: A bug regarding numerical batch IDs was fixed.
+#v1.6, 06/24/2019: Multiple normalization methods; visualization plots; sample exclusion; feature exclusion; cpglm and glm for statistical tests;
+#v1.5, 03/13/2019: If logTransform enabled, data=log2(data+logOffset) before statistical tests; p-values are FDR-adjusted; Bugs fixed; normalization enabled;
+#v1.4, 01/18/2019: Added functionality. (1) Averaging technical replicates. (2) Auto-IDs.
+#v1.1-1.3: Bugs fixed.
+#v1.0: Basic analysis including counting features and t-tests.
+#########################################################################
+
diff --git a/V1.7.1/data_template_1.7.1.xlsm b/V1.7.1/data_template_1.7.1.xlsm
new file mode 100755
index 0000000000000000000000000000000000000000..67dd2c1d744578a2f2ad102812218d7fbe42a7af
Binary files /dev/null and b/V1.7.1/data_template_1.7.1.xlsm differ
diff --git a/V1.7.1/run_analysis.sh b/V1.7.1/run_analysis.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bb285c4c6a24acdc992addb3291db2ab9d83827a
--- /dev/null
+++ b/V1.7.1/run_analysis.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+if [[ $# -lt 2 ]] # Bug fix: numeric -lt, not '<' (string compare, wrong for >=10 args)
+then
+ echo "Usage:"
+ echo "sh this_file.sh input_path/input_file_name.xlsx output_path/output_file_name.xlsx optional_BioHPC_queue_name"
+ exit 1
+fi
+module load R/3.5.1-gccmkl
+DIR=`dirname $0`
+INPUT_FILE=$1
+OUTPUT_FILE=$2
+QUEUE=$3
+if [ "$QUEUE" == "" ] # Default to the 256GB partition when no queue is given
+then
+ QUEUE=256GB
+fi
+srun -p $QUEUE Rscript $DIR/ODA_1.7.1.R $INPUT_FILE $OUTPUT_FILE