From cc0bbdeaeb0ea4bd9f4c6c8d1952b9f7f288abee Mon Sep 17 00:00:00 2001
From: Luke Zappia <luke.zappia@mcri.edu.au>
Date: Tue, 7 Mar 2017 03:26:32 +0000
Subject: [PATCH] Merge branch 'master' into devel

* master:
  Version 0.99.10
  Remove out.loProb from vignette
  Update default Splat parameters
  Remove out.loProb parameter
  Update estimation procedure

From: Luke Zappia <lazappi@users.noreply.github.com>

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/splatter@127213 bc3139a8-67e5-0310-9ffc-ced21a209358
---
 DESCRIPTION             |  4 +--
 NEWS.md                 |  7 ++++
 R/AllClasses.R          | 20 +++++------
 R/SplatParams-methods.R |  6 ----
 R/compare.R             | 19 +++++++++--
 R/splat-estimate.R      | 75 ++++++++++++++++++++++++++---------------
 R/splat-simulate.R      |  3 +-
 R/utils.R               | 25 +++++++++++++-
 man/SplatParams.Rd      |  2 --
 man/compareSCESets.Rd   |  2 ++
 man/splatEstBCV.Rd      | 13 +++++--
 man/splatEstMean.Rd     | 12 +++++--
 man/splatEstOutlier.Rd  | 13 +++----
 man/winsorize.Rd        | 19 +++++++++++
 vignettes/splatter.Rmd  |  2 --
 15 files changed, 152 insertions(+), 70 deletions(-)
 create mode 100644 man/winsorize.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 530d696..3ef758f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: splatter
 Type: Package
 Title: Simple Simulation of Single-cell RNA Sequencing Data
-Version: 0.99.9
-Date: 2017-02-02
+Version: 0.99.10
+Date: 2017-03-07
 Author: Luke Zappia
 Authors@R:
     c(person("Luke", "Zappia", role = c("aut", "cre"),
diff --git a/NEWS.md b/NEWS.md
index 43207e0..9480f91 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+# 0.99.10
+
+* Improve Splat estimation procedure
+* Update default Splat parameters
+* Remove out.loProb Splat parameter
+* Add MeanZeros plot to compareSCESets
+
 # 0.99.9
 
 * Add addGeneLengths function
diff --git a/R/AllClasses.R b/R/AllClasses.R
index c8066bd..568b391 100644
--- a/R/AllClasses.R
+++ b/R/AllClasses.R
@@ -96,8 +96,6 @@ setClass("SimpleParams",
 #'         \describe{
 #'             \item{\code{out.prob}}{Probability that a gene is an expression
 #'             outlier.}
-#'             \item{\code{out.loProb}}{Probability that an expression outlier
-#'             gene is lowly expressed.}
 #'             \item{\code{out.facLoc}}{Location (meanlog) parameter for the
 #'             expression outlier factor log-normal distribution.}
 #'             \item{\code{out.facScale}}{Scale (sdlog) parameter for the
@@ -182,7 +180,6 @@ setClass("SplatParams",
                    lib.loc = "numeric",
                    lib.scale = "numeric",
                    out.prob = "numeric",
-                   out.loProb = "numeric",
                    out.facLoc = "numeric",
                    out.facScale = "numeric",
                    de.prob = "numeric",
@@ -202,20 +199,19 @@ setClass("SplatParams",
          prototype = prototype(nGroups = 1,
                                groupCells = 100,
                                mean.rate = 0.3,
-                               mean.shape = 0.4,
-                               lib.loc = 10,
-                               lib.scale = 0.5,
-                               out.prob = 0.1,
-                               out.loProb = 0.5,
+                               mean.shape = 0.6,
+                               lib.loc = 11,
+                               lib.scale = 0.2,
+                               out.prob = 0.05,
                                out.facLoc = 4,
-                               out.facScale = 1,
+                               out.facScale = 0.5,
                                de.prob = 0.1,
                                de.downProb = 0.5,
                                de.facLoc = 4,
                                de.facScale = 1,
                                bcv.common = 0.1,
-                               bcv.df = 25,
-                               dropout.present = TRUE,
+                               bcv.df = 60,
+                               dropout.present = FALSE,
                                dropout.mid = 0,
                                dropout.shape = -1,
                                path.from = 0,
@@ -438,4 +434,4 @@ setClass("SCDDParams",
                                sd.range = c(1, 3),
                                modeFC = c(2, 3, 4),
                                varInflation = c(1, 1),
-                               condition = "condition"))
\ No newline at end of file
+                               condition = "condition"))
diff --git a/R/SplatParams-methods.R b/R/SplatParams-methods.R
index e3f773d..07f9916 100644
--- a/R/SplatParams-methods.R
+++ b/R/SplatParams-methods.R
@@ -27,11 +27,6 @@ setValidity("SplatParams", function(object) {
                 lib.loc = checkNumber(v$lib.loc),
                 lib.scale = checkNumber(v$lib.scale, lower = 0),
                 out.prob = checkNumber(v$out.prob, lower = 0, upper = 1),
-                out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1),
-                out.facLoc = checkNumber(v$lib.loc),
-                out.facScale = checkNumber(v$lib.scale, lower = 0),
-                out.prob = checkNumber(v$out.prob, lower = 0, upper = 1),
-                out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1),
                 out.facLoc = checkNumber(v$out.facLoc),
                 out.facScale = checkNumber(v$out.facScale, lower = 0),
                 de.prob = checkNumeric(v$de.prob, lower = 0, upper = 1,
@@ -108,7 +103,6 @@ setMethod("show", "SplatParams", function(object) {
                "Library size:"   = c("(Location)"     = "lib.loc",
                                      "(Scale)"        = "lib.scale"),
                "Exprs outliers:" = c("(Probability)"  = "out.prob",
-                                     "(Lo Prob)"      = "out.loProb",
                                      "(Location)"     = "out.facLoc",
                                      "(Scale)"        = "out.facScale"),
                "Diff expr:"      = c("[Probability]"  = "de.prob",
diff --git a/R/compare.R b/R/compare.R
index 82f2658..989417a 100644
--- a/R/compare.R
+++ b/R/compare.R
@@ -24,6 +24,8 @@
 #'             that is zero.}
 #'             \item{\code{ZerosCell}}{Boxplot of the percentage of each cell
 #'             that is zero.}
+#'             \item{\code{MeanZeros}}{Scatter plot with fitted lines showing
+#'             the mean-dropout relationship.}
 #'     }
 #'   }
 #' }
@@ -95,8 +97,8 @@ compareSCESets <- function(sces) {
 
     mean.var <- ggplot(fData.all,
                        aes_string(x = "mean_log_cpm", y = "var_log_cpm",
-                                  olour = "Dataset", fill = "Dataset")) +
-        geom_point() +
+                                  colour = "Dataset", fill = "Dataset")) +
+        geom_point(alpha = 0.2) +
         geom_smooth() +
         xlab(expression(paste("Mean ", log[2], "(CPM + 1)"))) +
         ylab(expression(paste("Variance ", log[2], "(CPM + 1)"))) +
@@ -130,6 +132,16 @@ compareSCESets <- function(sces) {
         ggtitle("Distribution of zeros per cell") +
         theme_minimal()
 
+    mean.zeros <- ggplot(fData.all,
+                         aes_string(x = "mean_log_cpm", y = "pct_dropout",
+                                    colour = "Dataset", fill = "Dataset")) +
+        geom_point(alpha = 0.2) +
+        geom_smooth() +
+        xlab(expression(paste("Mean ", log[2], "(CPM + 1)"))) +
+        ylab(expression(paste("Percentage zeros"))) +
+        ggtitle("Mean-dropout relationship") +
+        theme_minimal()
+
     comparison <- list(FeatureData = fData.all,
                        PhenoData = pData.all,
                        Plots = list(Means = means,
@@ -137,7 +149,8 @@ compareSCESets <- function(sces) {
                                     MeanVar = mean.var,
                                     LibrarySizes = libs,
                                     ZerosGene = z.gene,
-                                    ZerosCell = z.cell))
+                                    ZerosCell = z.cell,
+                                    MeanZeros = mean.zeros))
 
     return(comparison)
 }
diff --git a/R/splat-estimate.R b/R/splat-estimate.R
index 69ddec5..d7a070e 100644
--- a/R/splat-estimate.R
+++ b/R/splat-estimate.R
@@ -59,19 +59,34 @@ splatEstimate.matrix <- function(counts, params = newSplatParams()) {
 #' Estimate Splat mean parameters
 #'
 #' Estimate rate and shape parameters for the gamma distribution used to
-#' simulate gene expression means using the 'moment matching estimation' method
-#' of \code{\link[fitdistrplus]{fitdist}}.
+#' simulate gene expression means.
 #'
 #' @param norm.counts library size normalised counts matrix.
 #' @param params SplatParams object to store estimated values in.
 #'
+#' @details
+#' Parameter for the gamma distribution are estimated by fitting the mean
+#' normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum
+#' goodness-of-fit estimation' method is used to minimise the Cramer-von Mises
+#' distance. This can fail in some situations, in which case the 'method of
+#' moments estimation' method is used instead. Prior to fitting the means are
+#' winsorized by setting the top and bottom 10 percent of values to the 10th
+#' and 90th percentiles.
+#'
 #' @return SplatParams object with estimated values.
 splatEstMean <- function(norm.counts, params) {
 
     means <- rowMeans(norm.counts)
     means <- means[means != 0]
 
-    fit <- fitdistrplus::fitdist(means, "gamma", method = "mme")
+    means <- winsorize(means, q = 0.1)
+
+    fit <- try(fitdistrplus::fitdist(means, "gamma", method = "mge",
+                                     gof = "CvM"))
+    if (class(fit) == "try-error") {
+        warning("Goodness of fit failed, using Method of Moments")
+        fit <- fitdistrplus::fitdist(means, "gamma", method = "mme")
+    }
 
     params <- setParams(params, mean.shape = unname(fit$estimate["shape"]),
                         mean.rate = unname(fit$estimate["rate"]))
@@ -111,15 +126,12 @@ splatEstLib <- function(counts, params) {
 #' @details
 #' Expression outlier genes are detected using the Median Absolute Deviation
 #' (MAD) from median method. If the log2 mean expression of a gene is greater
-#' than two MADs from the median log2 mean expression it is designated as a
+#' than two MADs above the median log2 mean expression it is designated as an
 #' outlier. The proportion of outlier genes is used to estimate the outlier
-#' probability. The low outlier probability is estimated as the proportion of
-#' outlier genes that have a log2 mean less than the median log2 mean. Factors
-#' for each outlier gene are calculated by dividing mean expression by the
-#' median mean expression. A log-normal distribution is then fitted to these
-#' factors in order to estimate the outlier factor location and scale
-#' parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the
-#' fitting.
+#' probability. Factors for each outlier gene are calculated by dividing mean
+#' expression by the median mean expression. A log-normal distribution is then
+#' fitted to these factors in order to estimate the outlier factor location and
+#' scale parameters using \code{\link[fitdistrplus]{fitdist}}.
 #'
 #' @return SplatParams object with estimated values.
 splatEstOutlier <- function(norm.counts, params) {
@@ -130,34 +142,42 @@ splatEstOutlier <- function(norm.counts, params) {
     med <- median(lmeans)
     mad <- mad(lmeans)
 
-    lo.bound <- med - 2 * mad
-    hi.bound <- med + 2 * mad
+    bound <- med + 2 * mad
+
+    outs <- which(lmeans > bound)
 
-    lo.outs <- which(lmeans < lo.bound)
-    hi.outs <- which(lmeans > hi.bound)
+    prob <- length(outs) / nrow(norm.counts)
 
-    prob <- (length(lo.outs) + length(hi.outs)) / nrow(norm.counts)
-    lo.prob <- length(lo.outs) / (length(lo.outs) + length(hi.outs))
+    params <- setParams(params, out.prob = prob)
 
-    facs <- means[c(lo.outs, hi.outs)] / median(means)
-    fit <- fitdistrplus::fitdist(facs, "lnorm")
+    if (length(outs) > 1) {
+        facs <- means[outs] / median(means)
+        fit <- fitdistrplus::fitdist(facs, "lnorm")
 
-    params <- setParams(params, out.prob = prob, out.loProb = lo.prob,
-                        out.facLoc = unname(fit$estimate["meanlog"]),
-                        out.facScale = unname(fit$estimate["sdlog"]))
+        params <- setParams(params,
+                            out.facLoc = unname(fit$estimate["meanlog"]),
+                            out.facScale = unname(fit$estimate["sdlog"]))
+    }
 
     return(params)
 }
 
 #' Estimate Splat Biological Coefficient of Variation parameters
 #'
-#' Parameters are estimated using the \code{estimateDisp} function in the
-#' \code{edgeR} package. Specifically the common dispersion and prior degrees
-#' of freedom. See \code{\link{estimateDisp}} for details.
+#' Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function
+#' in the \code{edgeR} package.
 #'
 #' @param counts counts matrix to estimate parameters from.
 #' @param params SplatParams object to store estimated values in.
 #'
+#' @details
+#' The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common
+#' dispersion and prior degrees of freedom. See
+#' \code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on
+#' simulated data we found a broadly linear relationship between the true
+#' underlying common dispersion and the \code{edgR} estimate, therefore we
+#' apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}.
+#'
 #' @return SplatParams object with estimated values.
 splatEstBCV <- function(counts, params) {
 
@@ -165,7 +185,8 @@ splatEstBCV <- function(counts, params) {
     design <- matrix(1, ncol(counts), 1)
     disps <- edgeR::estimateDisp(counts, design = design)
 
-    params <- setParams(params, bcv.common = disps$common.dispersion,
+    params <- setParams(params,
+                        bcv.common = 0.1 + 0.25 * disps$common.dispersion,
                         bcv.df = disps$prior.df)
 
     return(params)
@@ -224,4 +245,4 @@ splatEstDropout <- function(norm.counts, params) {
                         dropout.shape = shape)
 
     return(params)
-}
\ No newline at end of file
+}
diff --git a/R/splat-simulate.R b/R/splat-simulate.R
index 4e3efb0..bb54cb6 100644
--- a/R/splat-simulate.R
+++ b/R/splat-simulate.R
@@ -280,7 +280,6 @@ splatSimGeneMeans <- function(sim, params) {
     mean.shape <- getParam(params, "mean.shape")
     mean.rate <- getParam(params, "mean.rate")
     out.prob <- getParam(params, "out.prob")
-    out.loProb <- getParam(params, "out.loProb")
     out.facLoc <- getParam(params, "out.facLoc")
     out.facScale <- getParam(params, "out.facScale")
 
@@ -288,7 +287,7 @@ splatSimGeneMeans <- function(sim, params) {
     base.means.gene <- rgamma(nGenes, shape = mean.shape, rate = mean.rate)
 
     # Add expression outliers
-    outlier.facs <- getLNormFactors(nGenes, out.prob, out.loProb, out.facLoc,
+    outlier.facs <- getLNormFactors(nGenes, out.prob, 0, out.facLoc,
                                     out.facScale)
     median.means.gene <- median(base.means.gene)
     outlier.means <- median.means.gene * outlier.facs
diff --git a/R/utils.R b/R/utils.R
index 73bb023..39e770d 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -26,4 +26,27 @@ rbindMatched <- function(df1, df2) {
     combined <- rbind(df1[, common.names], df2[, common.names])
 
     return(combined)
-}
\ No newline at end of file
+}
+
+#' Winsorize vector
+#'
+#' Set outliers in a numeric vector to a specified percentile.
+#'
+#' @param x Numeric vector to winsorize
+#' @param q Percentile to set from each end
+#'
+#' @return Winsorized numeric vector
+winsorize <- function(x, q) {
+
+    checkmate::check_numeric(x, any.missing = FALSE)
+    checkmate::check_number(q, lower = 0, upper = 1)
+
+    lohi <- stats::quantile(x, c(q, 1 - q), na.rm = TRUE)
+
+    if (diff(lohi) < 0) { lohi <- rev(lohi) }
+
+    x[!is.na(x) & x < lohi[1]] <- lohi[1]
+    x[!is.na(x) & x > lohi[2]] <- lohi[2]
+
+    return(x)
+}
diff --git a/man/SplatParams.Rd b/man/SplatParams.Rd
index 473507c..14bca2a 100644
--- a/man/SplatParams.Rd
+++ b/man/SplatParams.Rd
@@ -40,8 +40,6 @@ The Splatter simulation requires the following parameters:
         \describe{
             \item{\code{out.prob}}{Probability that a gene is an expression
             outlier.}
-            \item{\code{out.loProb}}{Probability that an expression outlier
-            gene is lowly expressed.}
             \item{\code{out.facLoc}}{Location (meanlog) parameter for the
             expression outlier factor log-normal distribution.}
             \item{\code{out.facScale}}{Scale (sdlog) parameter for the
diff --git a/man/compareSCESets.Rd b/man/compareSCESets.Rd
index b66c525..14eb59d 100644
--- a/man/compareSCESets.Rd
+++ b/man/compareSCESets.Rd
@@ -35,6 +35,8 @@ The return list has three items:
             that is zero.}
             \item{\code{ZerosCell}}{Boxplot of the percentage of each cell
             that is zero.}
+            \item{\code{MeanZeros}}{Scatter plot with fitted lines showing
+            the mean-dropout relationship.}
     }
   }
 }
diff --git a/man/splatEstBCV.Rd b/man/splatEstBCV.Rd
index 9b66b17..5f81642 100644
--- a/man/splatEstBCV.Rd
+++ b/man/splatEstBCV.Rd
@@ -15,7 +15,14 @@ splatEstBCV(counts, params)
 SplatParams object with estimated values.
 }
 \description{
-Parameters are estimated using the \code{estimateDisp} function in the
-\code{edgeR} package. Specifically the common dispersion and prior degrees
-of freedom. See \code{\link{estimateDisp}} for details.
+Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function
+in the \code{edgeR} package.
+}
+\details{
+The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common
+dispersion and prior degrees of freedom. See
+\code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on
+simulated data we found a broadly linear relationship between the true
+underlying common dispersion and the \code{edgR} estimate, therefore we
+apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}.
 }
diff --git a/man/splatEstMean.Rd b/man/splatEstMean.Rd
index e0892f9..c019f67 100644
--- a/man/splatEstMean.Rd
+++ b/man/splatEstMean.Rd
@@ -16,6 +16,14 @@ SplatParams object with estimated values.
 }
 \description{
 Estimate rate and shape parameters for the gamma distribution used to
-simulate gene expression means using the 'moment matching estimation' method
-of \code{\link[fitdistrplus]{fitdist}}.
+simulate gene expression means.
+}
+\details{
+Parameter for the gamma distribution are estimated by fitting the mean
+normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum
+goodness-of-fit estimation' method is used to minimise the Cramer-von Mises
+distance. This can fail in some situations, in which case the 'method of
+moments estimation' method is used instead. Prior to fitting the means are
+winsorized by setting the top and bottom 10 percent of values to the 10th
+and 90th percentiles.
 }
diff --git a/man/splatEstOutlier.Rd b/man/splatEstOutlier.Rd
index 81e5685..543b6e6 100644
--- a/man/splatEstOutlier.Rd
+++ b/man/splatEstOutlier.Rd
@@ -21,13 +21,10 @@ median mean expression level.
 \details{
 Expression outlier genes are detected using the Median Absolute Deviation
 (MAD) from median method. If the log2 mean expression of a gene is greater
-than two MADs from the median log2 mean expression it is designated as a
+than two MADs above the median log2 mean expression it is designated as an
 outlier. The proportion of outlier genes is used to estimate the outlier
-probability. The low outlier probability is estimated as the proportion of
-outlier genes that have a log2 mean less than the median log2 mean. Factors
-for each outlier gene are calculated by dividing mean expression by the
-median mean expression. A log-normal distribution is then fitted to these
-factors in order to estimate the outlier factor location and scale
-parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the
-fitting.
+probability. Factors for each outlier gene are calculated by dividing mean
+expression by the median mean expression. A log-normal distribution is then
+fitted to these factors in order to estimate the outlier factor location and
+scale parameters using \code{\link[fitdistrplus]{fitdist}}.
 }
diff --git a/man/winsorize.Rd b/man/winsorize.Rd
new file mode 100644
index 0000000..ddef66f
--- /dev/null
+++ b/man/winsorize.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{winsorize}
+\alias{winsorize}
+\title{Winsorize vector}
+\usage{
+winsorize(x, q)
+}
+\arguments{
+\item{x}{Numeric vector to winsorize}
+
+\item{q}{Percentile to set from each end}
+}
+\value{
+Winsorized numeric vector
+}
+\description{
+Set outliers in a numeric vector to a specified percentile.
+}
diff --git a/vignettes/splatter.Rmd b/vignettes/splatter.Rmd
index 8c1a815..03eb98f 100644
--- a/vignettes/splatter.Rmd
+++ b/vignettes/splatter.Rmd
@@ -102,8 +102,6 @@ The parameters required for the Splat simulation are briefly described here:
       distribution.
 * **Expression outlier parameters**
     * `out.prob` - Probability that a gene is an expression outlier.
-    * `out.loProb` - Probability that an expression outlier gene is lowly
-      expressed (other outliers are highly expressed).
     * `out.facLoc` - Location (meanlog) parameter for the expression outlier
       factor log-normal distribution.
     * `out.facScale` - Scale (sdlog) parameter for the expression outlier factor
-- 
GitLab