@@ -96,8 +96,6 @@ setClass("SimpleParams",
 #'         \describe{
 #'             \item{\code{out.prob}}{Probability that a gene is an expression
 #'             outlier.}
-#'             \item{\code{out.loProb}}{Probability that an expression outlier
-#'             gene is lowly expressed.}
 #'             \item{\code{out.facLoc}}{Location (meanlog) parameter for the
 #'             expression outlier factor log-normal distribution.}
 #'             \item{\code{out.facScale}}{Scale (sdlog) parameter for the
@@ -182,7 +180,6 @@ setClass("SplatParams",
                    lib.loc = "numeric",
                    lib.scale = "numeric",
                    out.prob = "numeric",
-                   out.loProb = "numeric",
                    out.facLoc = "numeric",
                    out.facScale = "numeric",
                    de.prob = "numeric",
@@ -202,20 +199,19 @@ setClass("SplatParams",
          prototype = prototype(nGroups = 1,
                                groupCells = 100,
                                mean.rate = 0.3,
-                               mean.shape = 0.4,
-                               lib.loc = 10,
-                               lib.scale = 0.5,
-                               out.prob = 0.1,
-                               out.loProb = 0.5,
+                               mean.shape = 0.6,
+                               lib.loc = 11,
+                               lib.scale = 0.2,
+                               out.prob = 0.05,
                                out.facLoc = 4,
-                               out.facScale = 1,
+                               out.facScale = 0.5,
                                de.prob = 0.1,
                                de.downProb = 0.5,
                                de.facLoc = 4,
                                de.facScale = 1,
                                bcv.common = 0.1,
-                               bcv.df = 25,
-                               dropout.present = TRUE,
+                               bcv.df = 60,
+                               dropout.present = FALSE,
                                dropout.mid = 0,
                                dropout.shape = -1,
                                path.from = 0,
@@ -438,4 +434,4 @@ setClass("SCDDParams",
                                sd.range = c(1, 3),
                                modeFC = c(2, 3, 4),
                                varInflation = c(1, 1),
-                               condition = "condition"))
\ No newline at end of file
+                               condition = "condition"))
@@ -27,11 +27,6 @@ setValidity("SplatParams", function(object) {
                 lib.loc = checkNumber(v$lib.loc),
                 lib.scale = checkNumber(v$lib.scale, lower = 0),
                 out.prob = checkNumber(v$out.prob, lower = 0, upper = 1),
-                out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1),
-                out.facLoc = checkNumber(v$lib.loc),
-                out.facScale = checkNumber(v$lib.scale, lower = 0),
-                out.prob = checkNumber(v$out.prob, lower = 0, upper = 1),
-                out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1),
                 out.facLoc = checkNumber(v$out.facLoc),
                 out.facScale = checkNumber(v$out.facScale, lower = 0),
                 de.prob = checkNumeric(v$de.prob, lower = 0, upper = 1,
@@ -108,7 +103,6 @@ setMethod("show", "SplatParams", function(object) {
                "Library size:"   = c("(Location)"     = "lib.loc",
                                      "(Scale)"        = "lib.scale"),
                "Exprs outliers:" = c("(Probability)"  = "out.prob",
-                                     "(Lo Prob)"      = "out.loProb",
                                      "(Location)"     = "out.facLoc",
                                      "(Scale)"        = "out.facScale"),
                "Diff expr:"      = c("[Probability]"  = "de.prob",
@@ -59,19 +59,34 @@ splatEstimate.matrix <- function(counts, params = newSplatParams()) {
 #' Estimate Splat mean parameters
 #' Estimate rate and shape parameters for the gamma distribution used to
-#' simulate gene expression means using the 'moment matching estimation' method
-#' of \code{\link[fitdistrplus]{fitdist}}.
+#' simulate gene expression means.
 #' @param norm.counts library size normalised counts matrix.
 #' @param params SplatParams object to store estimated values in.
+#' @details
+#' Parameters for the gamma distribution are estimated by fitting the mean
+#' normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum
+#' goodness-of-fit estimation' method is used to minimise the Cramer-von Mises
+#' distance. This can fail in some situations, in which case the 'method of
+#' moments estimation' method is used instead. Prior to fitting the means are
+#' winsorized by setting the bottom and top 10 percent of values to the 10th
+#' and 90th percentiles respectively.
 #' @return SplatParams object with estimated values.
 splatEstMean <- function(norm.counts, params) {
     means <- rowMeans(norm.counts)
     means <- means[means != 0]
-    fit <- fitdistrplus::fitdist(means, "gamma", method = "mme")
+    means <- winsorize(means, q = 0.1)
+    fit <- try(fitdistrplus::fitdist(means, "gamma", method = "mge",
+                                     gof = "CvM"))
+    if (class(fit) == "try-error") {
+        warning("Goodness of fit failed, using Method of Moments")
+        fit <- fitdistrplus::fitdist(means, "gamma", method = "mme")
+    }
     params <- setParams(params, mean.shape = unname(fit$estimate["shape"]),
                         mean.rate = unname(fit$estimate["rate"]))
@@ -111,15 +126,12 @@ splatEstLib <- function(counts, params) {
 #' @details
 #' Expression outlier genes are detected using the Median Absolute Deviation
 #' (MAD) from median method. If the log2 mean expression of a gene is greater
-#' than two MADs from the median log2 mean expression it is designated as a
+#' than two MADs above the median log2 mean expression it is designated as an
 #' outlier. The proportion of outlier genes is used to estimate the outlier
-#' probability. The low outlier probability is estimated as the proportion of
-#' outlier genes that have a log2 mean less than the median log2 mean. Factors
-#' for each outlier gene are calculated by dividing mean expression by the
-#' median mean expression. A log-normal distribution is then fitted to these
-#' factors in order to estimate the outlier factor location and scale
-#' parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the
-#' fitting.
+#' probability. Factors for each outlier gene are calculated by dividing mean
+#' expression by the median mean expression. A log-normal distribution is then
+#' fitted to these factors in order to estimate the outlier factor location and
+#' scale parameters using \code{\link[fitdistrplus]{fitdist}}.
 #' @return SplatParams object with estimated values.
 splatEstOutlier <- function(norm.counts, params) {
@@ -130,34 +142,42 @@ splatEstOutlier <- function(norm.counts, params) {
     med <- median(lmeans)
     mad <- mad(lmeans)
-    lo.bound <- med - 2 * mad
-    hi.bound <- med + 2 * mad
+    bound <- med + 2 * mad
+    outs <- which(lmeans > bound)
-    lo.outs <- which(lmeans < lo.bound)
-    hi.outs <- which(lmeans > hi.bound)
+    prob <- length(outs) / nrow(norm.counts)
-    prob <- (length(lo.outs) + length(hi.outs)) / nrow(norm.counts)
-    lo.prob <- length(lo.outs) / (length(lo.outs) + length(hi.outs))
+    params <- setParams(params, out.prob = prob)
-    facs <- means[c(lo.outs, hi.outs)] / median(means)
-    fit <- fitdistrplus::fitdist(facs, "lnorm")
+    if (length(outs) > 1) {
+        facs <- means[outs] / median(means)
+        fit <- fitdistrplus::fitdist(facs, "lnorm")
-    params <- setParams(params, out.prob = prob, out.loProb = lo.prob,
-                        out.facLoc = unname(fit$estimate["meanlog"]),
-                        out.facScale = unname(fit$estimate["sdlog"]))
+        params <- setParams(params,
+                            out.facLoc = unname(fit$estimate["meanlog"]),
+                            out.facScale = unname(fit$estimate["sdlog"]))
+    }
 @@ -165,7 +185,8 @@ splatEstBCV <- function(counts, params) {
-#' Parameters are estimated using the \code{estimateDisp} function in the
-#' \code{edgeR} package. Specifically the common dispersion and prior degrees
-#' of freedom. See \code{\link{estimateDisp}} for details.
+#' Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function
+#' in the \code{edgeR} package.
 #' @param counts counts matrix to estimate parameters from.
 #' @param params SplatParams object to store estimated values in.
+#' @details
+#' The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common
+#' dispersion and prior degrees of freedom. See
+#' \code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on
+#' simulated data we found a broadly linear relationship between the true
+#' underlying common dispersion and the \code{edgeR} estimate, therefore we
+#' apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}.
 #' @return SplatParams object with estimated values.
 splatEstBCV <- function(counts, params) {
@@ -165,7 +185,8 @@ splatEstBCV <- function(counts, params) {
     design <- matrix(1, ncol(counts), 1)
     disps <- edgeR::estimateDisp(counts, design = design)
-    params <- setParams(params, bcv.common = disps$common.dispersion,
+    params <- setParams(params,
+                        bcv.common = 0.1 + 0.25 * disps$common.dispersion,
                         bcv.df = disps$prior.df)
@@ -224,4 +245,4 @@ splatEstDropout <- function(norm.counts, params) {
                         dropout.shape = shape)
\ No newline at end of file
@@ -280,7 +280,6 @@ splatSimGeneMeans <- function(sim, params) {
     mean.shape <- getParam(params, "mean.shape")
     mean.rate <- getParam(params, "mean.rate")
     out.prob <- getParam(params, "out.prob")
-    out.loProb <- getParam(params, "out.loProb")
     out.facLoc <- getParam(params, "out.facLoc")
     out.facScale <- getParam(params, "out.facScale")
@@ -288,7 +287,7 @@ splatSimGeneMeans <- function(sim, params) {
     base.means.gene <- rgamma(nGenes, shape = mean.shape, rate = mean.rate)
     # Add expression outliers
-    outlier.facs <- getLNormFactors(nGenes, out.prob, out.loProb, out.facLoc,
+    outlier.facs <- getLNormFactors(nGenes, out.prob, 0, out.facLoc,
     median.means.gene <- median(base.means.gene)
     outlier.means <- median.means.gene * outlier.facs
@@ -26,4 +26,27 @@ rbindMatched <- function(df1, df2) {
     combined <- rbind(df1[, common.names], df2[, common.names])
\ No newline at end of file
+#' Winsorize vector
+#' Set outliers in a numeric vector to a specified percentile.
+#' @param x Numeric vector to winsorize
+#' @param q Percentile to set from each end
+#' @return Winsorized numeric vector
+winsorize <- function(x, q) {
+    checkmate::check_numeric(x, any.missing = FALSE)
+    checkmate::check_number(q, lower = 0, upper = 1)
+    lohi <- stats::quantile(x, c(q, 1 - q), na.rm = TRUE)
+    if (diff(lohi) < 0) { lohi <- rev(lohi) }
+    x[!is.na(x) & x < lohi[1]] <- lohi[1]
+    x[!is.na(x) & x > lohi[2]] <- lohi[2]
+    return(x)
@@ -40,8 +40,6 @@ The Splatter simulation requires the following parameters:
             \item{\code{out.prob}}{Probability that a gene is an expression
-            \item{\code{out.loProb}}{Probability that an expression outlier
-            gene is lowly expressed.}
             \item{\code{out.facLoc}}{Location (meanlog) parameter for the
             expression outlier factor log-normal distribution.}
             \item{\code{out.facScale}}{Scale (sdlog) parameter for the
@@ -15,7 +15,14 @@ splatEstBCV(counts, params)
 SplatParams object with estimated values.
-Parameters are estimated using the \code{estimateDisp} function in the
-\code{edgeR} package. Specifically the common dispersion and prior degrees
-of freedom. See \code{\link{estimateDisp}} for details.
+Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function
+in the \code{edgeR} package.
+The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common
+dispersion and prior degrees of freedom. See
+\code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on
+simulated data we found a broadly linear relationship between the true
+underlying common dispersion and the \code{edgeR} estimate, therefore we
+apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}.
@@ -16,6 +16,14 @@ SplatParams object with estimated values.
 Estimate rate and shape parameters for the gamma distribution used to
-simulate gene expression means using the 'moment matching estimation' method
-of \code{\link[fitdistrplus]{fitdist}}.
+simulate gene expression means.
+Parameters for the gamma distribution are estimated by fitting the mean
+normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum
+goodness-of-fit estimation' method is used to minimise the Cramer-von Mises
+distance. This can fail in some situations, in which case the 'method of
+moments estimation' method is used instead. Prior to fitting the means are
+winsorized by setting the bottom and top 10 percent of values to the 10th
+and 90th percentiles respectively.
@@ -21,13 +21,10 @@ median mean expression level.
 Expression outlier genes are detected using the Median Absolute Deviation
 (MAD) from median method. If the log2 mean expression of a gene is greater
-than two MADs from the median log2 mean expression it is designated as a
+than two MADs above the median log2 mean expression it is designated as an
 outlier. The proportion of outlier genes is used to estimate the outlier
-probability. The low outlier probability is estimated as the proportion of
-outlier genes that have a log2 mean less than the median log2 mean. Factors
-for each outlier gene are calculated by dividing mean expression by the
-median mean expression. A log-normal distribution is then fitted to these
-factors in order to estimate the outlier factor location and scale
-parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the
+probability. Factors for each outlier gene are calculated by dividing mean
+expression by the median mean expression. A log-normal distribution is then
+fitted to these factors in order to estimate the outlier factor location and
+scale parameters using \code{\link[fitdistrplus]{fitdist}}.
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\title{Winsorize vector}
+winsorize(x, q)
+\item{x}{Numeric vector to winsorize}
+\item{q}{Percentile to set from each end}
+Winsorized numeric vector
+Set outliers in a numeric vector to a specified percentile.
@@ -102,8 +102,6 @@ The parameters required for the Splat simulation are briefly described here:
 * **Expression outlier parameters**
     * `out.prob` - Probability that a gene is an expression outlier.
-    * `out.loProb` - Probability that an expression outlier gene is lowly
-      expressed (other outliers are highly expressed).
     * `out.facLoc` - Location (meanlog) parameter for the expression outlier
       factor log-normal distribution.
     * `out.facScale` - Scale (sdlog) parameter for the expression outlier factor