Merge branch 'master' into devel

* master: Version 0.99.10 Remove out.loProb from vignette Update default Splat parameters Remove out.loProb parameter Update estimation procedure From: Luke Zappia <lazappi@users.noreply.github.com> git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/splatter@127213 bc3139a8-67e5-0310-9ffc-ced21a209358

Merge branch 'master' into devel
cc0bbdea · Luke Zappia · 32e53deb · cc0bbdea · cc0bbdea · cc0bbdea
Commit cc0bbdea authored 8 years ago by Luke Zappia
--- a/DESCRIPTION
+++ b/DESCRIPTION
 Package: splatter
 Type: Package
 Title: Simple Simulation of Single-cell RNA Sequencing Data
-Version: 0.99.9
+Version: 0.99.10
-Date: 2017-02-02
+Date: 2017-03-07
 Author: Luke Zappia
 Authors@R:
    c(person("Luke", "Zappia", role = c("aut", "cre"),

--- a/NEWS.md
+++ b/NEWS.md
+# 0.99.10
+* Improve Splat estimation procedure
+* Update default Splat parameters
+* Remove out.loProb Splat parameter
+* Add MeanZeros plot to compareSCESets
 # 0.99.9
 * Add addGeneLengths function

--- a/R/AllClasses.R
+++ b/R/AllClasses.R
@@ -96,8 +96,6 @@ setClass("SimpleParams",
 #'         \describe{
 #'             \item{\code{out.prob}}{Probability that a gene is an expression
 #'             outlier.}
-#'             \item{\code{out.loProb}}{Probability that an expression outlier
-#'             gene is lowly expressed.}
 #'             \item{\code{out.facLoc}}{Location (meanlog) parameter for the
 #'             expression outlier factor log-normal distribution.}
 #'             \item{\code{out.facScale}}{Scale (sdlog) parameter for the
@@ -182,7 +180,6 @@ setClass("SplatParams",
                   lib.loc = "numeric",
                   lib.scale = "numeric",
                   out.prob = "numeric",
-                   out.loProb = "numeric",
                   out.facLoc = "numeric",
                   out.facScale = "numeric",
                   de.prob = "numeric",
@@ -202,20 +199,19 @@ setClass("SplatParams",
         prototype = prototype(nGroups = 1,
                               groupCells = 100,
                               mean.rate = 0.3,
-                               mean.shape = 0.4,
+                               mean.shape = 0.6,
-                               lib.loc = 10,
+                               lib.loc = 11,
-                               lib.scale = 0.5,
+                               lib.scale = 0.2,
-                               out.prob = 0.1,
+                               out.prob = 0.05,
-                               out.loProb = 0.5,
                               out.facLoc = 4,
-                               out.facScale = 1,
+                               out.facScale = 0.5,
                               de.prob = 0.1,
                               de.downProb = 0.5,
                               de.facLoc = 4,
                               de.facScale = 1,
                               bcv.common = 0.1,
-                               bcv.df = 25,
+                               bcv.df = 60,
-                               dropout.present = TRUE,
+                               dropout.present = FALSE,
                               dropout.mid = 0,
                               dropout.shape = -1,
                               path.from = 0,
@@ -438,4 +434,4 @@ setClass("SCDDParams",
                               sd.range = c(1, 3),
                               modeFC = c(2, 3, 4),
                               varInflation = c(1, 1),
                               condition = "condition"))
\ No newline at end of file
--- a/R/SplatParams-methods.R
+++ b/R/SplatParams-methods.R
@@ -27,11 +27,6 @@ setValidity("SplatParams", function(object) {
                lib.loc = checkNumber(v$lib.loc),
                lib.scale = checkNumber(v$lib.scale, lower = 0),
                out.prob = checkNumber(v$out.prob, lower = 0, upper = 1),
-                out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1),
-                out.facLoc = checkNumber(v$lib.loc),
-                out.facScale = checkNumber(v$lib.scale, lower = 0),
-                out.prob = checkNumber(v$out.prob, lower = 0, upper = 1),
-                out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1),
                out.facLoc = checkNumber(v$out.facLoc),
                out.facScale = checkNumber(v$out.facScale, lower = 0),
                de.prob = checkNumeric(v$de.prob, lower = 0, upper = 1,
@@ -108,7 +103,6 @@ setMethod("show", "SplatParams", function(object) {
               "Library size:"   = c("(Location)"     = "lib.loc",
                                     "(Scale)"        = "lib.scale"),
               "Exprs outliers:" = c("(Probability)"  = "out.prob",
-                                     "(Lo Prob)"      = "out.loProb",
                                     "(Location)"     = "out.facLoc",
                                     "(Scale)"        = "out.facScale"),
               "Diff expr:"      = c("[Probability]"  = "de.prob",

--- a/R/compare.R
+++ b/R/compare.R
@@ -24,6 +24,8 @@
 #'             that is zero.}
 #'             \item{\code{ZerosCell}}{Boxplot of the percentage of each cell
 #'             that is zero.}
+#'             \item{\code{MeanZeros}}{Scatter plot with fitted lines showing
+#'             the mean-dropout relationship.}
 #'     }
 #'   }
 #' }
@@ -95,8 +97,8 @@ compareSCESets <- function(sces) {
    mean.var <- ggplot(fData.all,
                       aes_string(x = "mean_log_cpm", y = "var_log_cpm",
-                                  olour = "Dataset", fill = "Dataset")) +
+                                  colour = "Dataset", fill = "Dataset")) +
-        geom_point() +
+        geom_point(alpha = 0.2) +
        geom_smooth() +
        xlab(expression(paste("Mean ", log[2], "(CPM + 1)"))) +
        ylab(expression(paste("Variance ", log[2], "(CPM + 1)"))) +
@@ -130,6 +132,16 @@ compareSCESets <- function(sces) {
        ggtitle("Distribution of zeros per cell") +
        theme_minimal()
+    mean.zeros <- ggplot(fData.all,
+                         aes_string(x = "mean_log_cpm", y = "pct_dropout",
+                                    colour = "Dataset", fill = "Dataset")) +
+        geom_point(alpha = 0.2) +
+        geom_smooth() +
+        xlab(expression(paste("Mean ", log[2], "(CPM + 1)"))) +
+        ylab(expression(paste("Percentage zeros"))) +
+        ggtitle("Mean-dropout relationship") +
+        theme_minimal()
    comparison <- list(FeatureData = fData.all,
                       PhenoData = pData.all,
                       Plots = list(Means = means,
@@ -137,7 +149,8 @@ compareSCESets <- function(sces) {
                                    MeanVar = mean.var,
                                    LibrarySizes = libs,
                                    ZerosGene = z.gene,
-                                    ZerosCell = z.cell))
+                                    ZerosCell = z.cell,
+                                    MeanZeros = mean.zeros))
    return(comparison)
 }
--- a/R/splat-estimate.R
+++ b/R/splat-estimate.R
@@ -59,19 +59,34 @@ splatEstimate.matrix <- function(counts, params = newSplatParams()) {
 #' Estimate Splat mean parameters
 #'
 #' Estimate rate and shape parameters for the gamma distribution used to
-#' simulate gene expression means using the 'moment matching estimation' method
+#' simulate gene expression means.
-#' of \code{\link[fitdistrplus]{fitdist}}.
 #'
 #' @param norm.counts library size normalised counts matrix.
 #' @param params SplatParams object to store estimated values in.
 #'
+#' @details
+#' Parameter for the gamma distribution are estimated by fitting the mean
+#' normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum
+#' goodness-of-fit estimation' method is used to minimise the Cramer-von Mises
+#' distance. This can fail in some situations, in which case the 'method of
+#' moments estimation' method is used instead. Prior to fitting the means are
+#' winsorized by setting the top and bottom 10 percent of values to the 10th
+#' and 90th percentiles.
+#'
 #' @return SplatParams object with estimated values.
 splatEstMean <- function(norm.counts, params) {
    means <- rowMeans(norm.counts)
    means <- means[means != 0]
-    fit <- fitdistrplus::fitdist(means, "gamma", method = "mme")
+    means <- winsorize(means, q = 0.1)
+    fit <- try(fitdistrplus::fitdist(means, "gamma", method = "mge",
+                                     gof = "CvM"))
+    if (class(fit) == "try-error") {
+        warning("Goodness of fit failed, using Method of Moments")
+        fit <- fitdistrplus::fitdist(means, "gamma", method = "mme")
+    }
    params <- setParams(params, mean.shape = unname(fit$estimate["shape"]),
                        mean.rate = unname(fit$estimate["rate"]))
@@ -111,15 +126,12 @@ splatEstLib <- function(counts, params) {
 #' @details
 #' Expression outlier genes are detected using the Median Absolute Deviation
 #' (MAD) from median method. If the log2 mean expression of a gene is greater
-#' than two MADs from the median log2 mean expression it is designated as a
+#' than two MADs above the median log2 mean expression it is designated as an
 #' outlier. The proportion of outlier genes is used to estimate the outlier
-#' probability. The low outlier probability is estimated as the proportion of
+#' probability. Factors for each outlier gene are calculated by dividing mean
-#' outlier genes that have a log2 mean less than the median log2 mean. Factors
+#' expression by the median mean expression. A log-normal distribution is then
-#' for each outlier gene are calculated by dividing mean expression by the
+#' fitted to these factors in order to estimate the outlier factor location and
-#' median mean expression. A log-normal distribution is then fitted to these
+#' scale parameters using \code{\link[fitdistrplus]{fitdist}}.
-#' factors in order to estimate the outlier factor location and scale
-#' parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the
-#' fitting.
 #'
 #' @return SplatParams object with estimated values.
 splatEstOutlier <- function(norm.counts, params) {
@@ -130,34 +142,42 @@ splatEstOutlier <- function(norm.counts, params) {
    med <- median(lmeans)
    mad <- mad(lmeans)
-    lo.bound <- med - 2 * mad
+    bound <- med + 2 * mad
-    hi.bound <- med + 2 * mad
+    outs <- which(lmeans > bound)
-    lo.outs <- which(lmeans < lo.bound)
+    prob <- length(outs) / nrow(norm.counts)
-    hi.outs <- which(lmeans > hi.bound)
-    prob <- (length(lo.outs) + length(hi.outs)) / nrow(norm.counts)
+    params <- setParams(params, out.prob = prob)
-    lo.prob <- length(lo.outs) / (length(lo.outs) + length(hi.outs))
-    facs <- means[c(lo.outs, hi.outs)] / median(means)
+    if (length(outs) > 1) {
-    fit <- fitdistrplus::fitdist(facs, "lnorm")
+        facs <- means[outs] / median(means)
+        fit <- fitdistrplus::fitdist(facs, "lnorm")
-    params <- setParams(params, out.prob = prob, out.loProb = lo.prob,
+        params <- setParams(params,
-                        out.facLoc = unname(fit$estimate["meanlog"]),
+                            out.facLoc = unname(fit$estimate["meanlog"]),
-                        out.facScale = unname(fit$estimate["sdlog"]))
+                            out.facScale = unname(fit$estimate["sdlog"]))
+    }
    return(params)
 }
 #' Estimate Splat Biological Coefficient of Variation parameters
 #'
-#' Parameters are estimated using the \code{estimateDisp} function in the
+#' Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function
-#' \code{edgeR} package. Specifically the common dispersion and prior degrees
+#' in the \code{edgeR} package.
-#' of freedom. See \code{\link{estimateDisp}} for details.
 #'
 #' @param counts counts matrix to estimate parameters from.
 #' @param params SplatParams object to store estimated values in.
 #'
+#' @details
+#' The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common
+#' dispersion and prior degrees of freedom. See
+#' \code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on
+#' simulated data we found a broadly linear relationship between the true
+#' underlying common dispersion and the \code{edgR} estimate, therefore we
+#' apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}.
+#'
 #' @return SplatParams object with estimated values.
 splatEstBCV <- function(counts, params) {
@@ -165,7 +185,8 @@ splatEstBCV <- function(counts, params) {
    design <- matrix(1, ncol(counts), 1)
    disps <- edgeR::estimateDisp(counts, design = design)
-    params <- setParams(params, bcv.common = disps$common.dispersion,
+    params <- setParams(params,
+                        bcv.common = 0.1 + 0.25 * disps$common.dispersion,
                        bcv.df = disps$prior.df)
    return(params)
@@ -224,4 +245,4 @@ splatEstDropout <- function(norm.counts, params) {
                        dropout.shape = shape)
    return(params)
 }
\ No newline at end of file
--- a/R/splat-simulate.R
+++ b/R/splat-simulate.R
@@ -280,7 +280,6 @@ splatSimGeneMeans <- function(sim, params) {
    mean.shape <- getParam(params, "mean.shape")
    mean.rate <- getParam(params, "mean.rate")
    out.prob <- getParam(params, "out.prob")
-    out.loProb <- getParam(params, "out.loProb")
    out.facLoc <- getParam(params, "out.facLoc")
    out.facScale <- getParam(params, "out.facScale")
@@ -288,7 +287,7 @@ splatSimGeneMeans <- function(sim, params) {
    base.means.gene <- rgamma(nGenes, shape = mean.shape, rate = mean.rate)
    # Add expression outliers
-    outlier.facs <- getLNormFactors(nGenes, out.prob, out.loProb, out.facLoc,
+    outlier.facs <- getLNormFactors(nGenes, out.prob, 0, out.facLoc,
                                    out.facScale)
    median.means.gene <- median(base.means.gene)
    outlier.means <- median.means.gene * outlier.facs

--- a/R/utils.R
+++ b/R/utils.R
@@ -26,4 +26,27 @@ rbindMatched <- function(df1, df2) {
    combined <- rbind(df1[, common.names], df2[, common.names])
    return(combined)
 }
\ No newline at end of file
+#' Winsorize vector
+#'
+#' Set outliers in a numeric vector to a specified percentile.
+#'
+#' @param x Numeric vector to winsorize
+#' @param q Percentile to set from each end
+#'
+#' @return Winsorized numeric vector
+winsorize <- function(x, q) {
+    checkmate::check_numeric(x, any.missing = FALSE)
+    checkmate::check_number(q, lower = 0, upper = 1)
+    lohi <- stats::quantile(x, c(q, 1 - q), na.rm = TRUE)
+    if (diff(lohi) < 0) { lohi <- rev(lohi) }
+    x[!is.na(x) & x < lohi[1]] <- lohi[1]
+    x[!is.na(x) & x > lohi[2]] <- lohi[2]
+    return(x)
+}
--- a/man/SplatParams.Rd
+++ b/man/SplatParams.Rd
@@ -40,8 +40,6 @@ The Splatter simulation requires the following parameters:
        \describe{
            \item{\code{out.prob}}{Probability that a gene is an expression
            outlier.}
-            \item{\code{out.loProb}}{Probability that an expression outlier
-            gene is lowly expressed.}
            \item{\code{out.facLoc}}{Location (meanlog) parameter for the
            expression outlier factor log-normal distribution.}
            \item{\code{out.facScale}}{Scale (sdlog) parameter for the

--- a/man/compareSCESets.Rd
+++ b/man/compareSCESets.Rd
@@ -35,6 +35,8 @@ The return list has three items:
            that is zero.}
            \item{\code{ZerosCell}}{Boxplot of the percentage of each cell
            that is zero.}
+            \item{\code{MeanZeros}}{Scatter plot with fitted lines showing
+            the mean-dropout relationship.}
    }
  }
 }

--- a/man/splatEstBCV.Rd
+++ b/man/splatEstBCV.Rd
@@ -15,7 +15,14 @@ splatEstBCV(counts, params)
 SplatParams object with estimated values.
 }
 \description{
-Parameters are estimated using the \code{estimateDisp} function in the
+Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function
-\code{edgeR} package. Specifically the common dispersion and prior degrees
+in the \code{edgeR} package.
-of freedom. See \code{\link{estimateDisp}} for details.
+}
+\details{
+The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common
+dispersion and prior degrees of freedom. See
+\code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on
+simulated data we found a broadly linear relationship between the true
+underlying common dispersion and the \code{edgR} estimate, therefore we
+apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}.
 }
--- a/man/splatEstMean.Rd
+++ b/man/splatEstMean.Rd
@@ -16,6 +16,14 @@ SplatParams object with estimated values.
 }
 \description{
 Estimate rate and shape parameters for the gamma distribution used to
-simulate gene expression means using the 'moment matching estimation' method
+simulate gene expression means.
-of \code{\link[fitdistrplus]{fitdist}}.
+}
+\details{
+Parameter for the gamma distribution are estimated by fitting the mean
+normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum
+goodness-of-fit estimation' method is used to minimise the Cramer-von Mises
+distance. This can fail in some situations, in which case the 'method of
+moments estimation' method is used instead. Prior to fitting the means are
+winsorized by setting the top and bottom 10 percent of values to the 10th
+and 90th percentiles.
 }
--- a/man/splatEstOutlier.Rd
+++ b/man/splatEstOutlier.Rd
@@ -21,13 +21,10 @@ median mean expression level.
 \details{
 Expression outlier genes are detected using the Median Absolute Deviation
 (MAD) from median method. If the log2 mean expression of a gene is greater
-than two MADs from the median log2 mean expression it is designated as a
+than two MADs above the median log2 mean expression it is designated as an
 outlier. The proportion of outlier genes is used to estimate the outlier
-probability. The low outlier probability is estimated as the proportion of
+probability. Factors for each outlier gene are calculated by dividing mean
-outlier genes that have a log2 mean less than the median log2 mean. Factors
+expression by the median mean expression. A log-normal distribution is then
-for each outlier gene are calculated by dividing mean expression by the
+fitted to these factors in order to estimate the outlier factor location and
-median mean expression. A log-normal distribution is then fitted to these
+scale parameters using \code{\link[fitdistrplus]{fitdist}}.
-factors in order to estimate the outlier factor location and scale
-parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the
-fitting.
 }
--- a/man/winsorize.Rd
+++ b/man/winsorize.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{winsorize}
+\alias{winsorize}
+\title{Winsorize vector}
+\usage{
+winsorize(x, q)
+}
+\arguments{
+\item{x}{Numeric vector to winsorize}
+\item{q}{Percentile to set from each end}
+}
+\value{
+Winsorized numeric vector
+}
+\description{
+Set outliers in a numeric vector to a specified percentile.
+}
--- a/vignettes/splatter.Rmd
+++ b/vignettes/splatter.Rmd
@@ -102,8 +102,6 @@ The parameters required for the Splat simulation are briefly described here:
      distribution.
 * **Expression outlier parameters**
    * `out.prob` - Probability that a gene is an expression outlier.
-    * `out.loProb` - Probability that an expression outlier gene is lowly
-      expressed (other outliers are highly expressed).
    * `out.facLoc` - Location (meanlog) parameter for the expression outlier
      factor log-normal distribution.
    * `out.facScale` - Scale (sdlog) parameter for the expression outlier factor