Merge branch 'master' into devel

* master: Version 0.99.15 Remove dropout.present estimation Add limits to zeros plots Adjust default splat DE fac params Add BPPARAM to scDDSimulate Add summariseDiff function From: Luke Zappia <lazappi@users.noreply.github.com> git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/splatter@128698 bc3139a8-67e5-0310-9ffc-ced21a209358

Merge branch 'master' into devel
3394256b · Luke Zappia · fcdc4b84 · 3394256b · 3394256b · 3394256b
Commit 3394256b authored 7 years ago by Luke Zappia
--- a/DESCRIPTION
+++ b/DESCRIPTION
 Package: splatter
 Type: Package
 Title: Simple Simulation of Single-cell RNA Sequencing Data
-Version: 0.99.14
-Date: 2017-03-28
+Version: 0.99.15
+Date: 2017-04-14
 Author: Luke Zappia
 Authors@R:
    c(person("Luke", "Zappia", role = c("aut", "cre"),

--- a/NAMESPACE
+++ b/NAMESPACE
@@ -39,6 +39,7 @@ export(splatSimulate)
 export(splatSimulateGroups)
 export(splatSimulatePaths)
 export(splatSimulateSingle)
+export(summariseDiff)
 exportClasses(Lun2Params)
 exportClasses(LunParams)
 exportClasses(SCDDParams)

--- a/NEWS.md
+++ b/NEWS.md
+# 0.99.15
+
+* Add summariseDiff function
+* Add BPPARAM argument to scDDSimulate
+* Adjust default Spalt DE factor parameters
+* Add limits to zeros diff plots
+* Remove estimation of dropout.present
+
 # 0.99.14

 * Add functions for making comparison panels

--- a/R/AllClasses.R
+++ b/R/AllClasses.R
@@ -207,8 +207,8 @@ setClass("SplatParams",
                               out.facScale = 0.5,
                               de.prob = 0.1,
                               de.downProb = 0.5,
-                               de.facLoc = 4,
-                               de.facScale = 1,
+                               de.facLoc = 0.1,
+                               de.facScale = 0.4,
                               bcv.common = 0.1,
                               bcv.df = 60,
                               dropout.present = FALSE,

--- a/R/SplatParams-methods.R
+++ b/R/SplatParams-methods.R
@@ -111,7 +111,7 @@ setMethod("show", "SplatParams", function(object) {
                                     "[Scale]"        = "de.facScale"),
               "BCV:"            = c("(Common Disp)"  = "bcv.common",
                                     "(DoF)"          = "bcv.df"),
-               "Dropout:"        = c("(Present)"      = "dropout.present",
+               "Dropout:"        = c("[Present]"      = "dropout.present",
                                     "(Midpoint)"     = "dropout.mid",
                                     "(Shape)"        = "dropout.shape"),
               "Paths:"          = c("[From]"         = "path.from",

--- a/R/compare.R
+++ b/R/compare.R
@@ -400,6 +400,7 @@ diffSCESets <- function(sces, ref, point.size = 0.1, point.alpha = 0.1,
                                colour = "Dataset")) +
        geom_hline(yintercept = 0, colour = "red") +
        geom_boxplot() +
+        scale_y_continuous(limits = c(0, 100)) +
        scale_colour_manual(values = colours) +
        ylab(paste("Rank difference percentage zeros")) +
        ggtitle("Difference in zeros per gene") +
@@ -410,6 +411,7 @@ diffSCESets <- function(sces, ref, point.size = 0.1, point.alpha = 0.1,
                                colour = "Dataset")) +
        geom_hline(yintercept = 0, colour = "red") +
        geom_boxplot() +
+        scale_y_continuous(limits = c(0, 100)) +
        scale_colour_manual(values = colours) +
        ylab(paste("Rank difference percentage zeros")) +
        ggtitle("Difference in zeros per cell") +
@@ -784,3 +786,66 @@ makeOverallPanel <- function(comp, diff, title = "Overall comparison",

    return(panel)
 }
+
+#' Summarise diffSCESets
+#'
+#' Summarise the results of \code{\link{diffSCESets}}. The various
+#' properties are sorted, differences calculated, the Median Absolute Deviation
+#' taken as the summary statistic and the ranks calculated.
+#'
+#' @param diff Output from \code{\link{diffSCESets}}
+#'
+#' @return List with MADs, ranks and both combined in long format
+#' @examples
+#' sim1 <- splatSimulate(nGenes = 1000, groupCells = 20)
+#' sim2 <- simpleSimulate(nGenes = 1000, nCells = 20)
+#' difference <- diffSCESets(list(Splat = sim1, Simple = sim2), ref = "Simple")
+#' summary <- summariseDiff(difference)
+#' names(summary)
+#' head(summary$Long)
+#' @export
+summariseDiff <- function(diff) {
+
+    datasets <- unique(diff$PhenoData$Dataset)
+
+    fData.mads <- sapply(datasets, function(dataset) {
+        df <- diff$FeatureData[diff$FeatureData$Dataset == dataset, ]
+        mean <- median(abs(df$RankDiffMeanLogCPM))
+        var <- median(abs(df$RankDiffVarLogCPM))
+        zeros <- median(abs(df$RankDiffZeros))
+        mean.var <- median(abs(df$MeanRankVarDiff))
+        mean.zeros <- median(abs(df$MeanRankZerosDiff))
+        return(c(Mean = mean, Variance = var, ZerosGene = zeros,
+                 MeanVar = mean.var, MeanZeros = mean.zeros))
+    })
+
+    pData.mads <- sapply(datasets, function(dataset) {
+        df <- diff$PhenoData[diff$PhenoData$Dataset == dataset, ]
+        lib.size <- median(abs(df$RankDiffLibSize))
+        zeros <- median(abs(df$RankDiffZeros))
+        return(c(LibSize = lib.size, ZerosCell = zeros))
+    })
+
+    mads <- data.frame(Dataset = datasets, t(fData.mads), t(pData.mads))
+
+    fData.ranks <- matrixStats::rowRanks(fData.mads)
+    pData.ranks <- matrixStats::rowRanks(pData.mads)
+
+    ranks <- data.frame(Dataset = datasets, t(fData.ranks), t(pData.ranks))
+    colnames(ranks) <- paste0(colnames(mads), "Rank")
+
+    mads.long <- stats::reshape(mads, varying = 2:8, direction = "long",
+                                idvar = "Dataset", timevar = "Statistic",
+                                times = colnames(mads)[2:8], v.names = "MAD")
+
+    ranks.long <- stats::reshape(ranks, varying = 2:8, direction = "long",
+                                 idvar = "Dataset", timevar = "Statistic",
+                                 times = colnames(ranks)[2:8], v.names = "Rank")
+
+    long <- data.frame(mads.long, Rank = ranks.long$Rank)
+    row.names(long) <- NULL
+
+    summary <- list(MADs = mads, Ranks = ranks, Long = long)
+
+    return(summary)
+}
--- a/R/scDD-simulate.R
+++ b/R/scDD-simulate.R
@@ -7,6 +7,9 @@
 #' plots.
 #' @param plot.file File path to save plots as PDF.
 #' @param verbose logical. Whether to print progress messages
+#' @param BPPARAM A \code{\link[BiocParallel]{BiocParallelParam}} instance
+#'        giving the parallel back-end to be used. Default is
+#'        \code{\link[BiocParallel]{SerialParam}} which uses a single core.
 #' @param ... any additional parameter settings to override what is provided in
 #'        \code{params}.
 #'
@@ -33,8 +36,10 @@
 #' }
 #' @export
 #' @importFrom scater newSCESet
+#' @importFrom BiocParallel SerialParam
 scDDSimulate <- function(params = newSCDDParams(), plots = FALSE,
-                         plot.file = NULL, verbose = TRUE, ...) {
+                         plot.file = NULL, verbose = TRUE,
+                         BPPARAM = SerialParam(), ...) {

    checkmate::assertClass(params, "SCDDParams")
    params <- setParams(params, ...)
@@ -62,7 +67,8 @@ scDDSimulate <- function(params = newSCDDParams(), plots = FALSE,
                                      plot.file = plot.file,
                                      random.seed = getParam(params, "seed"),
                                      varInflation = varInflation,
-                                      condition = getParam(params, "condition"))
+                                      condition = getParam(params, "condition"),
+                                      param = BPPARAM)
    } else {
        suppressMessages(
        scDD.sim <- scDD::simulateSet(SCdat = getParam(params, "SCdat"),
@@ -79,7 +85,8 @@ scDDSimulate <- function(params = newSCDDParams(), plots = FALSE,
                                      plot.file = plot.file,
                                      random.seed = getParam(params, "seed"),
                                      varInflation = varInflation,
-                                      condition = getParam(params, "condition"))
+                                      condition = getParam(params, "condition"),
+                                      param = BPPARAM)
        )
    }


--- a/R/splat-estimate.R
+++ b/R/splat-estimate.R
@@ -195,8 +195,10 @@ splatEstBCV <- function(counts, params) {
 #' Estimate Splat dropout parameters
 #'
 #' Estimate the midpoint and shape parameters for the logistic function used
-#' when simulating dropout. Also estimates whether dropout is likely to be
-#' present in the dataset.
+#' when simulating dropout.
+#'
+# #' Also estimates whether dropout is likely to be
+# #' present in the dataset.
 #'
 #' @param norm.counts library size normalised counts matrix.
 #' @param params SplatParams object to store estimated values in.
@@ -204,17 +206,19 @@ splatEstBCV <- function(counts, params) {
 #' @details
 #' Logistic function parameters are estimated by fitting a logistic function
 #' to the relationship between log2 mean gene expression and the proportion of
-#' zeros in each gene. See \code{\link[stats]{nls}} for details of fitting. The
-#' presence of dropout is determined by comparing the observed number of zeros
-#' in each gene to the expected number of zeros from a negative binomial
-#' distribution with the gene mean and a dispersion of 0.1. If the maximum
-#' difference between the observed number of zeros and the expected number is
-#' greater than 10 percent of the number of cells
-#' (\code{max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)}) then dropout is
-#' considered to be present in the dataset. This is a somewhat crude measure
-#' but should give a reasonable indication. A more accurate approach is to look
-#' at a plot of log2 mean expression vs the difference between observed and
-#' expected number of zeros across all genes.
+#' zeros in each gene. See \code{\link[stats]{nls}} for details of fitting.
+#'
+# #' The
+# #' presence of dropout is determined by comparing the observed number of zeros
+# #' in each gene to the expected number of zeros from a negative binomial
+# #' distribution with the gene mean and a dispersion of 0.1. If the maximum
+# #' difference between the observed number of zeros and the expected number is
+# #' greater than 10 percent of the number of cells
+# #' (\code{max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)}) then dropout
+# #' is considered to be present in the dataset. This is a somewhat crude
+# #' measure but should give a reasonable indication. A more accurate approach
+# #' is to look at a plot of log2 mean expression vs the difference between
+# #' observed and expected number of zeros across all genes.
 #'
 #' @return SplatParams object with estimated values.
 #'
@@ -234,15 +238,14 @@ splatEstDropout <- function(norm.counts, params) {
    fit <- nls(y ~ logistic(x, x0 = x0, k = k), data = df,
               start = list(x0 = 0, k = -1))

-    exp.zeros <- dnbinom(0, mu = means, size = 1 / 0.1) * ncol(norm.counts)
+    #exp.zeros <- dnbinom(0, mu = means, size = 1 / 0.1) * ncol(norm.counts)

-    present <- max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)
+    #present <- max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)

    mid <- summary(fit)$coefficients["x0", "Estimate"]
    shape <- summary(fit)$coefficients["k", "Estimate"]

-    params <- setParams(params, dropout.present = present, dropout.mid = mid,
-                        dropout.shape = shape)
+    params <- setParams(params, dropout.mid = mid, dropout.shape = shape)

    return(params)
 }
--- a/man/scDDSimulate.Rd
+++ b/man/scDDSimulate.Rd
@@ -5,7 +5,7 @@
 \title{scDD simulation}
 \usage{
 scDDSimulate(params = newSCDDParams(), plots = FALSE, plot.file = NULL,
-  verbose = TRUE, ...)
+  verbose = TRUE, BPPARAM = SerialParam(), ...)
 }
 \arguments{
 \item{params}{SCDDParams object containing simulation parameters.}
@@ -17,6 +17,10 @@ plots.}

 \item{verbose}{logical. Whether to print progress messages}

+\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam}} instance
+giving the parallel back-end to be used. Default is
+\code{\link[BiocParallel]{SerialParam}} which uses a single core.}
+
 \item{...}{any additional parameter settings to override what is provided in
 \code{params}.}
 }

--- a/man/splatEstDropout.Rd
+++ b/man/splatEstDropout.Rd
@@ -16,21 +16,10 @@ SplatParams object with estimated values.
 }
 \description{
 Estimate the midpoint and shape parameters for the logistic function used
-when simulating dropout. Also estimates whether dropout is likely to be
-present in the dataset.
+when simulating dropout.
 }
 \details{
 Logistic function parameters are estimated by fitting a logistic function
 to the relationship between log2 mean gene expression and the proportion of
-zeros in each gene. See \code{\link[stats]{nls}} for details of fitting. The
-presence of dropout is determined by comparing the observed number of zeros
-in each gene to the expected number of zeros from a negative binomial
-distribution with the gene mean and a dispersion of 0.1. If the maximum
-difference between the observed number of zeros and the expected number is
-greater than 10 percent of the number of cells
-(\code{max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)}) then dropout is
-considered to be present in the dataset. This is a somewhat crude measure
-but should give a reasonable indication. A more accurate approach is to look
-at a plot of log2 mean expression vs the difference between observed and
-expected number of zeros across all genes.
+zeros in each gene. See \code{\link[stats]{nls}} for details of fitting.
 }
--- a/man/summariseDiff.Rd
+++ b/man/summariseDiff.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compare.R
+\name{summariseDiff}
+\alias{summariseDiff}
+\title{Summarise diffSCESets}
+\usage{
+summariseDiff(diff)
+}
+\arguments{
+\item{diff}{Output from \code{\link{diffSCESets}}}
+}
+\value{
+List with MADs, ranks and both combined in long format
+}
+\description{
+Summarise the results of \code{\link{diffSCESets}}. The various
+properties are sorted, differences calculated, the Median Absolute Deviation
+taken as the summary statistic and the ranks calculated.
+}
+\examples{
+sim1 <- splatSimulate(nGenes = 1000, groupCells = 20)
+sim2 <- simpleSimulate(nGenes = 1000, nCells = 20)
+difference <- diffSCESets(list(Splat = sim1, Simple = sim2), ref = "Simple")
+summary <- summariseDiff(difference)
+names(summary)
+head(summary$Long)
+}