From cc0bbdeaeb0ea4bd9f4c6c8d1952b9f7f288abee Mon Sep 17 00:00:00 2001 From: Luke Zappia <luke.zappia@mcri.edu.au> Date: Tue, 7 Mar 2017 03:26:32 +0000 Subject: [PATCH] Merge branch 'master' into devel * master: Version 0.99.10 Remove out.loProb from vignette Update default Splat parameters Remove out.loProb parameter Update estimation procedure From: Luke Zappia <lazappi@users.noreply.github.com> git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/splatter@127213 bc3139a8-67e5-0310-9ffc-ced21a209358 --- DESCRIPTION | 4 +-- NEWS.md | 7 ++++ R/AllClasses.R | 20 +++++------ R/SplatParams-methods.R | 6 ---- R/compare.R | 19 +++++++++-- R/splat-estimate.R | 75 ++++++++++++++++++++++++++--------------- R/splat-simulate.R | 3 +- R/utils.R | 25 +++++++++++++- man/SplatParams.Rd | 2 -- man/compareSCESets.Rd | 2 ++ man/splatEstBCV.Rd | 13 +++++-- man/splatEstMean.Rd | 12 +++++-- man/splatEstOutlier.Rd | 13 +++---- man/winsorize.Rd | 19 +++++++++++ vignettes/splatter.Rmd | 2 -- 15 files changed, 152 insertions(+), 70 deletions(-) create mode 100644 man/winsorize.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 530d696..3ef758f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: splatter Type: Package Title: Simple Simulation of Single-cell RNA Sequencing Data -Version: 0.99.9 -Date: 2017-02-02 +Version: 0.99.10 +Date: 2017-03-07 Author: Luke Zappia Authors@R: c(person("Luke", "Zappia", role = c("aut", "cre"), diff --git a/NEWS.md b/NEWS.md index 43207e0..9480f91 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# 0.99.10 + +* Improve Splat estimation procedure +* Update default Splat parameters +* Remove out.loProb Splat parameter +* Add MeanZeros plot to compareSCESets + # 0.99.9 * Add addGeneLengths function diff --git a/R/AllClasses.R b/R/AllClasses.R index c8066bd..568b391 100644 --- a/R/AllClasses.R +++ b/R/AllClasses.R @@ -96,8 +96,6 @@ setClass("SimpleParams", #' \describe{ #' \item{\code{out.prob}}{Probability that a gene is an expression #' outlier.} -#' \item{\code{out.loProb}}{Probability that an expression outlier -#' gene is lowly expressed.} #' \item{\code{out.facLoc}}{Location (meanlog) parameter for the #' expression outlier factor log-normal distribution.} #' \item{\code{out.facScale}}{Scale (sdlog) parameter for the @@ -182,7 +180,6 @@ setClass("SplatParams", lib.loc = "numeric", lib.scale = "numeric", out.prob = "numeric", - out.loProb = "numeric", out.facLoc = "numeric", out.facScale = "numeric", de.prob = "numeric", @@ -202,20 +199,19 @@ setClass("SplatParams", prototype = prototype(nGroups = 1, groupCells = 100, mean.rate = 0.3, - mean.shape = 0.4, - lib.loc = 10, - lib.scale = 0.5, - out.prob = 0.1, - out.loProb = 0.5, + mean.shape = 0.6, + lib.loc = 11, + lib.scale = 0.2, + out.prob = 0.05, out.facLoc = 4, - out.facScale = 1, + out.facScale = 0.5, de.prob = 0.1, de.downProb = 0.5, de.facLoc = 4, de.facScale = 1, bcv.common = 0.1, - bcv.df = 25, - dropout.present = TRUE, + bcv.df = 60, + dropout.present = FALSE, dropout.mid = 0, dropout.shape = -1, path.from = 0, @@ -438,4 +434,4 @@ setClass("SCDDParams", sd.range = c(1, 3), modeFC = c(2, 3, 4), varInflation = c(1, 1), - condition = "condition")) \ No newline at end of file + condition = "condition")) diff --git a/R/SplatParams-methods.R b/R/SplatParams-methods.R index e3f773d..07f9916 100644 --- a/R/SplatParams-methods.R +++ b/R/SplatParams-methods.R @@ -27,11 +27,6 @@ setValidity("SplatParams", function(object) { lib.loc = checkNumber(v$lib.loc), lib.scale = checkNumber(v$lib.scale, lower = 0), out.prob = checkNumber(v$out.prob, lower = 0, upper = 1), - out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1), - out.facLoc = checkNumber(v$lib.loc), - out.facScale = checkNumber(v$lib.scale, lower = 0), - out.prob = checkNumber(v$out.prob, lower = 0, upper = 1), - out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1), out.facLoc = checkNumber(v$out.facLoc), out.facScale = checkNumber(v$out.facScale, lower = 0), de.prob = checkNumeric(v$de.prob, lower = 0, upper = 1, @@ -108,7 +103,6 @@ setMethod("show", "SplatParams", function(object) { "Library size:" = c("(Location)" = "lib.loc", "(Scale)" = "lib.scale"), "Exprs outliers:" = c("(Probability)" = "out.prob", - "(Lo Prob)" = "out.loProb", "(Location)" = "out.facLoc", "(Scale)" = "out.facScale"), "Diff expr:" = c("[Probability]" = "de.prob", diff --git a/R/compare.R b/R/compare.R index 82f2658..989417a 100644 --- a/R/compare.R +++ b/R/compare.R @@ -24,6 +24,8 @@ #' that is zero.} #' \item{\code{ZerosCell}}{Boxplot of the percentage of each cell #' that is zero.} +#' \item{\code{MeanZeros}}{Scatter plot with fitted lines showing +#' the mean-dropout relationship.} #' } #' } #' } @@ -95,8 +97,8 @@ compareSCESets <- function(sces) { mean.var <- ggplot(fData.all, aes_string(x = "mean_log_cpm", y = "var_log_cpm", - olour = "Dataset", fill = "Dataset")) + - geom_point() + + colour = "Dataset", fill = "Dataset")) + + geom_point(alpha = 0.2) + geom_smooth() + xlab(expression(paste("Mean ", log[2], "(CPM + 1)"))) + ylab(expression(paste("Variance ", log[2], "(CPM + 1)"))) + @@ -130,6 +132,16 @@ compareSCESets <- function(sces) { ggtitle("Distribution of zeros per cell") + theme_minimal() + mean.zeros <- ggplot(fData.all, + aes_string(x = "mean_log_cpm", y = "pct_dropout", + colour = "Dataset", fill = "Dataset")) + + geom_point(alpha = 0.2) + + geom_smooth() + + xlab(expression(paste("Mean ", log[2], "(CPM + 1)"))) + + ylab(expression(paste("Percentage zeros"))) + + ggtitle("Mean-dropout relationship") + + theme_minimal() + comparison <- list(FeatureData = fData.all, PhenoData = pData.all, Plots = list(Means = means, @@ -137,7 +149,8 @@ compareSCESets <- function(sces) { MeanVar = mean.var, LibrarySizes = libs, ZerosGene = z.gene, - ZerosCell = z.cell)) + ZerosCell = z.cell, + MeanZeros = mean.zeros)) return(comparison) } diff --git a/R/splat-estimate.R b/R/splat-estimate.R index 69ddec5..d7a070e 100644 --- a/R/splat-estimate.R +++ b/R/splat-estimate.R @@ -59,19 +59,34 @@ splatEstimate.matrix <- function(counts, params = newSplatParams()) { #' Estimate Splat mean parameters #' #' Estimate rate and shape parameters for the gamma distribution used to -#' simulate gene expression means using the 'moment matching estimation' method -#' of \code{\link[fitdistrplus]{fitdist}}. +#' simulate gene expression means. #' #' @param norm.counts library size normalised counts matrix. #' @param params SplatParams object to store estimated values in. #' +#' @details +#' Parameter for the gamma distribution are estimated by fitting the mean +#' normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum +#' goodness-of-fit estimation' method is used to minimise the Cramer-von Mises +#' distance. This can fail in some situations, in which case the 'method of +#' moments estimation' method is used instead. Prior to fitting the means are +#' winsorized by setting the top and bottom 10 percent of values to the 10th +#' and 90th percentiles. +#' #' @return SplatParams object with estimated values. splatEstMean <- function(norm.counts, params) { means <- rowMeans(norm.counts) means <- means[means != 0] - fit <- fitdistrplus::fitdist(means, "gamma", method = "mme") + means <- winsorize(means, q = 0.1) + + fit <- try(fitdistrplus::fitdist(means, "gamma", method = "mge", + gof = "CvM")) + if (class(fit) == "try-error") { + warning("Goodness of fit failed, using Method of Moments") + fit <- fitdistrplus::fitdist(means, "gamma", method = "mme") + } params <- setParams(params, mean.shape = unname(fit$estimate["shape"]), mean.rate = unname(fit$estimate["rate"])) @@ -111,15 +126,12 @@ splatEstLib <- function(counts, params) { #' @details #' Expression outlier genes are detected using the Median Absolute Deviation #' (MAD) from median method. If the log2 mean expression of a gene is greater -#' than two MADs from the median log2 mean expression it is designated as a +#' than two MADs above the median log2 mean expression it is designated as an #' outlier. The proportion of outlier genes is used to estimate the outlier -#' probability. The low outlier probability is estimated as the proportion of -#' outlier genes that have a log2 mean less than the median log2 mean. Factors -#' for each outlier gene are calculated by dividing mean expression by the -#' median mean expression. A log-normal distribution is then fitted to these -#' factors in order to estimate the outlier factor location and scale -#' parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the -#' fitting. +#' probability. Factors for each outlier gene are calculated by dividing mean +#' expression by the median mean expression. A log-normal distribution is then +#' fitted to these factors in order to estimate the outlier factor location and +#' scale parameters using \code{\link[fitdistrplus]{fitdist}}. #' #' @return SplatParams object with estimated values. splatEstOutlier <- function(norm.counts, params) { @@ -130,34 +142,42 @@ splatEstOutlier <- function(norm.counts, params) { med <- median(lmeans) mad <- mad(lmeans) - lo.bound <- med - 2 * mad - hi.bound <- med + 2 * mad + bound <- med + 2 * mad + + outs <- which(lmeans > bound) - lo.outs <- which(lmeans < lo.bound) - hi.outs <- which(lmeans > hi.bound) + prob <- length(outs) / nrow(norm.counts) - prob <- (length(lo.outs) + length(hi.outs)) / nrow(norm.counts) - lo.prob <- length(lo.outs) / (length(lo.outs) + length(hi.outs)) + params <- setParams(params, out.prob = prob) - facs <- means[c(lo.outs, hi.outs)] / median(means) - fit <- fitdistrplus::fitdist(facs, "lnorm") + if (length(outs) > 1) { + facs <- means[outs] / median(means) + fit <- fitdistrplus::fitdist(facs, "lnorm") - params <- setParams(params, out.prob = prob, out.loProb = lo.prob, - out.facLoc = unname(fit$estimate["meanlog"]), - out.facScale = unname(fit$estimate["sdlog"])) + params <- setParams(params, + out.facLoc = unname(fit$estimate["meanlog"]), + out.facScale = unname(fit$estimate["sdlog"])) + } return(params) } #' Estimate Splat Biological Coefficient of Variation parameters #' -#' Parameters are estimated using the \code{estimateDisp} function in the -#' \code{edgeR} package. Specifically the common dispersion and prior degrees -#' of freedom. See \code{\link{estimateDisp}} for details. +#' Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function +#' in the \code{edgeR} package. #' #' @param counts counts matrix to estimate parameters from. #' @param params SplatParams object to store estimated values in. #' +#' @details +#' The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common +#' dispersion and prior degrees of freedom. See +#' \code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on +#' simulated data we found a broadly linear relationship between the true +#' underlying common dispersion and the \code{edgR} estimate, therefore we +#' apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}. +#' #' @return SplatParams object with estimated values. splatEstBCV <- function(counts, params) { @@ -165,7 +185,8 @@ splatEstBCV <- function(counts, params) { design <- matrix(1, ncol(counts), 1) disps <- edgeR::estimateDisp(counts, design = design) - params <- setParams(params, bcv.common = disps$common.dispersion, + params <- setParams(params, + bcv.common = 0.1 + 0.25 * disps$common.dispersion, bcv.df = disps$prior.df) return(params) @@ -224,4 +245,4 @@ splatEstDropout <- function(norm.counts, params) { dropout.shape = shape) return(params) -} \ No newline at end of file +} diff --git a/R/splat-simulate.R b/R/splat-simulate.R index 4e3efb0..bb54cb6 100644 --- a/R/splat-simulate.R +++ b/R/splat-simulate.R @@ -280,7 +280,6 @@ splatSimGeneMeans <- function(sim, params) { mean.shape <- getParam(params, "mean.shape") mean.rate <- getParam(params, "mean.rate") out.prob <- getParam(params, "out.prob") - out.loProb <- getParam(params, "out.loProb") out.facLoc <- getParam(params, "out.facLoc") out.facScale <- getParam(params, "out.facScale") @@ -288,7 +287,7 @@ splatSimGeneMeans <- function(sim, params) { base.means.gene <- rgamma(nGenes, shape = mean.shape, rate = mean.rate) # Add expression outliers - outlier.facs <- getLNormFactors(nGenes, out.prob, out.loProb, out.facLoc, + outlier.facs <- getLNormFactors(nGenes, out.prob, 0, out.facLoc, out.facScale) median.means.gene <- median(base.means.gene) outlier.means <- median.means.gene * outlier.facs diff --git a/R/utils.R b/R/utils.R index 73bb023..39e770d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -26,4 +26,27 @@ rbindMatched <- function(df1, df2) { combined <- rbind(df1[, common.names], df2[, common.names]) return(combined) -} \ No newline at end of file +} + +#' Winsorize vector +#' +#' Set outliers in a numeric vector to a specified percentile. +#' +#' @param x Numeric vector to winsorize +#' @param q Percentile to set from each end +#' +#' @return Winsorized numeric vector +winsorize <- function(x, q) { + + checkmate::check_numeric(x, any.missing = FALSE) + checkmate::check_number(q, lower = 0, upper = 1) + + lohi <- stats::quantile(x, c(q, 1 - q), na.rm = TRUE) + + if (diff(lohi) < 0) { lohi <- rev(lohi) } + + x[!is.na(x) & x < lohi[1]] <- lohi[1] + x[!is.na(x) & x > lohi[2]] <- lohi[2] + + return(x) +} diff --git a/man/SplatParams.Rd b/man/SplatParams.Rd index 473507c..14bca2a 100644 --- a/man/SplatParams.Rd +++ b/man/SplatParams.Rd @@ -40,8 +40,6 @@ The Splatter simulation requires the following parameters: \describe{ \item{\code{out.prob}}{Probability that a gene is an expression outlier.} - \item{\code{out.loProb}}{Probability that an expression outlier - gene is lowly expressed.} \item{\code{out.facLoc}}{Location (meanlog) parameter for the expression outlier factor log-normal distribution.} \item{\code{out.facScale}}{Scale (sdlog) parameter for the diff --git a/man/compareSCESets.Rd b/man/compareSCESets.Rd index b66c525..14eb59d 100644 --- a/man/compareSCESets.Rd +++ b/man/compareSCESets.Rd @@ -35,6 +35,8 @@ The return list has three items: that is zero.} \item{\code{ZerosCell}}{Boxplot of the percentage of each cell that is zero.} + \item{\code{MeanZeros}}{Scatter plot with fitted lines showing + the mean-dropout relationship.} } } } diff --git a/man/splatEstBCV.Rd b/man/splatEstBCV.Rd index 9b66b17..5f81642 100644 --- a/man/splatEstBCV.Rd +++ b/man/splatEstBCV.Rd @@ -15,7 +15,14 @@ splatEstBCV(counts, params) SplatParams object with estimated values. } \description{ -Parameters are estimated using the \code{estimateDisp} function in the -\code{edgeR} package. Specifically the common dispersion and prior degrees -of freedom. See \code{\link{estimateDisp}} for details. +Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function +in the \code{edgeR} package. +} +\details{ +The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common +dispersion and prior degrees of freedom. See +\code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on +simulated data we found a broadly linear relationship between the true +underlying common dispersion and the \code{edgR} estimate, therefore we +apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}. } diff --git a/man/splatEstMean.Rd b/man/splatEstMean.Rd index e0892f9..c019f67 100644 --- a/man/splatEstMean.Rd +++ b/man/splatEstMean.Rd @@ -16,6 +16,14 @@ SplatParams object with estimated values. } \description{ Estimate rate and shape parameters for the gamma distribution used to -simulate gene expression means using the 'moment matching estimation' method -of \code{\link[fitdistrplus]{fitdist}}. +simulate gene expression means. +} +\details{ +Parameter for the gamma distribution are estimated by fitting the mean +normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum +goodness-of-fit estimation' method is used to minimise the Cramer-von Mises +distance. This can fail in some situations, in which case the 'method of +moments estimation' method is used instead. Prior to fitting the means are +winsorized by setting the top and bottom 10 percent of values to the 10th +and 90th percentiles. } diff --git a/man/splatEstOutlier.Rd b/man/splatEstOutlier.Rd index 81e5685..543b6e6 100644 --- a/man/splatEstOutlier.Rd +++ b/man/splatEstOutlier.Rd @@ -21,13 +21,10 @@ median mean expression level. \details{ Expression outlier genes are detected using the Median Absolute Deviation (MAD) from median method. If the log2 mean expression of a gene is greater -than two MADs from the median log2 mean expression it is designated as a +than two MADs above the median log2 mean expression it is designated as an outlier. The proportion of outlier genes is used to estimate the outlier -probability. The low outlier probability is estimated as the proportion of -outlier genes that have a log2 mean less than the median log2 mean. Factors -for each outlier gene are calculated by dividing mean expression by the -median mean expression. A log-normal distribution is then fitted to these -factors in order to estimate the outlier factor location and scale -parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the -fitting. +probability. Factors for each outlier gene are calculated by dividing mean +expression by the median mean expression. A log-normal distribution is then +fitted to these factors in order to estimate the outlier factor location and +scale parameters using \code{\link[fitdistrplus]{fitdist}}. } diff --git a/man/winsorize.Rd b/man/winsorize.Rd new file mode 100644 index 0000000..ddef66f --- /dev/null +++ b/man/winsorize.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{winsorize} +\alias{winsorize} +\title{Winsorize vector} +\usage{ +winsorize(x, q) +} +\arguments{ +\item{x}{Numeric vector to winsorize} + +\item{q}{Percentile to set from each end} +} +\value{ +Winsorized numeric vector +} +\description{ +Set outliers in a numeric vector to a specified percentile. +} diff --git a/vignettes/splatter.Rmd b/vignettes/splatter.Rmd index 8c1a815..03eb98f 100644 --- a/vignettes/splatter.Rmd +++ b/vignettes/splatter.Rmd @@ -102,8 +102,6 @@ The parameters required for the Splat simulation are briefly described here: distribution. * **Expression outlier parameters** * `out.prob` - Probability that a gene is an expression outlier. - * `out.loProb` - Probability that an expression outlier gene is lowly - expressed (other outliers are highly expressed). * `out.facLoc` - Location (meanlog) parameter for the expression outlier factor log-normal distribution. * `out.facScale` - Scale (sdlog) parameter for the expression outlier factor -- GitLab