From dc9f6823c0e904e6eb20ea507230107810a4fc36 Mon Sep 17 00:00:00 2001
From: Luke Zappia <lazappi@users.noreply.github.com>
Date: Tue, 20 Aug 2019 16:37:29 +1000
Subject: [PATCH] Add expression outliers to Splotch

---
 DESCRIPTION               |  4 ++--
 NEWS.md                   | 18 +++++++++++-------
 R/AllClasses.R            | 12 ++++++++++++
 R/SplotchParams-methods.R | 12 +++++++++---
 R/splotch-estimate.R      | 36 +++++++++++++++++++++++++++++-------
 R/splotch-simulate.R      | 11 +++++++++++
 man/SplotchParams.Rd      |  6 ++++++
 7 files changed, 80 insertions(+), 19 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 237c3be..a64f3c8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: splatter
 Type: Package
 Title: Simple Simulation of Single-cell RNA Sequencing Data
-Version: 1.9.3.9007
-Date: 2019-08-14
+Version: 1.9.3.9008
+Date: 2019-08-20
 Author: Luke Zappia
 Authors@R:
     c(person("Luke", "Zappia", role = c("aut", "cre"),
diff --git a/NEWS.md b/NEWS.md
index 325e91f..5b71bdc 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,7 @@
+### Version 1.9.3.9008 (2019-08-20)
+
+* Add expression outliers to Splotch
+
 ### Version 1.9.3.9007 (2019-08-14)
 
 * Fix bug in selectFit
@@ -8,8 +12,8 @@
 
 ### Version 1.9.3.9005 (2019-08-08)
 
-* Simulate counts
-* Split into separate functions for each stage
+* Simulate counts in splotchSimulate
+* Split splotchSimulate into separate functions for each stage
 
 ### Version 1.9.3.9004 (2019-08-08)
 
@@ -21,17 +25,17 @@
 
 ### Version 1.9.2.9003 (2019-07-17)
 
-* Topologically sort paths
-* Add library size parameters
-* Simulate cell means
+* Topologically sort Splotch paths
+* Add library size parameters to SplotchParams
+* Simulate cell means in splotchSimulate
 
 ### Version 1.9.2.9002 (2019-07-16)
 
-* Add paths parameters
+* Add paths parameters to SplotchParams
 
 ### Version 1.9.2.9001 (2019-07-16)
 
-* Add mean parameters
+* Add mean parameters to SplotchParams
 
 ### Version 1.9.2.9000 (2019-07-11)
 
diff --git a/R/AllClasses.R b/R/AllClasses.R
index e497256..4124ac3 100644
--- a/R/AllClasses.R
+++ b/R/AllClasses.R
@@ -273,6 +273,12 @@ setClass("SplatParams",
 #'             distribution.}
 #'             \item{\code{mean.rate}}{Rate parameter for the mean gamma
 #'             distribution.}
+#'             \item{\code{mean.outProb}}{Probability that a gene is an
+#'             expression outlier.}
+#'             \item{\code{mean.outFacLoc}}{Location (meanlog) parameter for
+#'             the expression outlier factor log-normal distribution.}
+#'             \item{\code{mean.outFacScale}}{Scale (sdlog) parameter for the
+#'             expression outlier factor log-normal distribution.}
 #'             \item{\code{mean.values}}{Vector of means for each gene.}
 #'         }
 #'     }
@@ -320,6 +326,9 @@ setClass("SplotchParams",
          contains = "Params",
          slots = c(mean.shape = "numeric",
                    mean.rate = "numeric",
+                   mean.outProb = "numeric",
+                   mean.outLoc = "numeric",
+                   mean.outScale = "numeric",
                    mean.values = "numeric",
                    network.graph = "ANY",
                    network.nRegs = "numeric",
@@ -332,6 +341,9 @@ setClass("SplotchParams",
                    cells.design = "data.frame"),
          prototype = prototype(mean.rate = 0.3,
                                mean.shape = 0.6,
+                               mean.outProb = 0.05,
+                               mean.outLoc = 4,
+                               mean.outScale = 0.5,
                                mean.values = numeric(),
                                network.graph = NULL,
                                network.nRegs = 100,
diff --git a/R/SplotchParams-methods.R b/R/SplotchParams-methods.R
index b673e03..9b6e736 100644
--- a/R/SplotchParams-methods.R
+++ b/R/SplotchParams-methods.R
@@ -22,6 +22,9 @@ setValidity("SplotchParams", function(object) {
                 seed = checkmate::checkInt(v$seed, lower = 0),
                 mean.rate = checkmate::checkNumber(v$mean.rate, lower = 0),
                 mean.shape = checkmate::checkNumber(v$mean.shape, lower = 0),
+                mean.outProb = checkNumber(v$mean.outProb, lower = 0, upper = 1),
+                mean.outLoc = checkNumber(v$mean.outLoc),
+                mean.outScale = checkNumber(v$mean.outScale, lower = 0),
                 network.graph = checkmate::checkClass(v$network, "igraph",
                                                       null.ok = TRUE),
                 network.nRegs = checkmate::checkInt(v$network.nRegs,
@@ -115,9 +118,12 @@ setValidity("SplotchParams", function(object) {
 #' @importFrom methods show
 setMethod("show", "SplotchParams", function(object) {
 
-    pp.top <- list("Mean:" = c("(Rate)"    = "mean.rate",
-                               "(Shape)"   = "mean.shape",
-                               "[Values]*" = "mean.values"))
+    pp.top <- list("Mean:" = c("(Rate)"         = "mean.rate",
+                               "(Shape)"        = "mean.shape",
+                               "(Out Prob)"     = "mean.outProb",
+                               "(Out Location)" = "mean.outLoc",
+                               "(Out Scale)"    = "mean.outScale",
+                               "[Values]*"      = "mean.values"))
 
     pp.network <- list("Network:" = c("[Graph]"   = "network.graph",
                                       "[nRegs]"   = "network.nRegs",
diff --git a/R/splotch-estimate.R b/R/splotch-estimate.R
index 7e3230f..36a0792 100644
--- a/R/splotch-estimate.R
+++ b/R/splotch-estimate.R
@@ -72,6 +72,28 @@ splotchEstMean <- function(norm.counts, params, verbose) {
     params <- setParams(params, mean.shape = unname(fit$estimate["shape"]),
                         mean.rate = unname(fit$estimate["rate"]))
 
+    if (verbose) {message("Estimating expression outlier parameters...")}
+    lmeans <- log(means)
+    med <- median(lmeans)
+    mad <- mad(lmeans)
+
+    bound <- med + 1 * mad
+
+    outs <- which(lmeans > bound)
+
+    prob <- length(outs) / nrow(norm.counts)
+
+    params <- setParam(params, "mean.outProb", prob)
+
+    if (length(outs) > 1) {
+        facs <- means[outs] / median(means)
+        fit <- selectFit(facs, "lnorm", verbose = verbose)
+
+        params <- setParams(params,
+                            mean.outLoc = unname(fit$estimate["meanlog"]),
+                            mean.outScale = unname(fit$estimate["sdlog"]))
+    }
+
     return(params)
 }
 
@@ -163,15 +185,15 @@ selectFit <- function(data, distr, weights = NULL, verbose = TRUE) {
         )
     }
 
-    aics <- fitdistrplus::gofstat(fits)$aic
-    # Flatten in case aics is a list
-    aics.flat <- unlist(aics)
-    selected <- which(aics.flat == min(aics.flat, na.rm = TRUE))
+    scores <- fitdistrplus::gofstat(fits)$cvm
+    # Flatten in case scores is a list
+    scores.flat <- unlist(scores)
+    selected <- which(scores.flat == min(scores.flat, na.rm = TRUE))
 
     if (verbose) {
-        # Work around to get name in case aics is a list
-        name <- names(fits)[names(aics) == names(aics.flat)[selected]]
-        message("Selected ", name, " fit using AIC")
+        # Work around to get name in case scores is a list
+        name <- names(fits)[names(scores) == names(scores.flat)[selected]]
+        message("Selected ", name, " fit")
     }
 
     return(fits[[selected]])
diff --git a/R/splotch-simulate.R b/R/splotch-simulate.R
index ad3f1a1..3fa1e82 100644
--- a/R/splotch-simulate.R
+++ b/R/splotch-simulate.R
@@ -148,8 +148,19 @@ splotchSimGeneMeans <- function(params, verbose) {
     nGenes <- getParam(params, "nGenes")
     mean.shape <- getParam(params, "mean.shape")
     mean.rate <- getParam(params, "mean.rate")
+    mean.outProb <- getParam(params, "mean.outProb")
+    mean.outLoc <- getParam(params, "mean.outLoc")
+    mean.outScale <- getParam(params, "mean.outScale")
 
     mean.values <- rgamma(nGenes, shape = mean.shape, rate = mean.rate)
+
+    outlier.facs <- getLNormFactors(nGenes, mean.outProb, 0, mean.outLoc,
+                                    mean.outScale)
+    median.means.gene <- median(mean.values)
+    outlier.means <- median.means.gene * outlier.facs
+    is.outlier <- outlier.facs != 1
+    mean.values[is.outlier] <- outlier.means[is.outlier]
+
     params <- setParam(params, "mean.values", mean.values)
 
     return(params)
diff --git a/man/SplotchParams.Rd b/man/SplotchParams.Rd
index 3c437d2..904b42a 100644
--- a/man/SplotchParams.Rd
+++ b/man/SplotchParams.Rd
@@ -23,6 +23,12 @@ The Splotch simulation uses the following parameters:
             distribution.}
             \item{\code{mean.rate}}{Rate parameter for the mean gamma
             distribution.}
+            \item{\code{mean.outProb}}{Probability that a gene is an
+            expression outlier.}
+            \item{\code{mean.outFacLoc}}{Location (meanlog) parameter for
+            the expression outlier factor log-normal distribution.}
+            \item{\code{mean.outFacScale}}{Scale (sdlog) parameter for the
+            expression outlier factor log-normal distribution.}
             \item{\code{mean.values}}{Vector of means for each gene.}
         }
     }
-- 
GitLab