diff --git a/DESCRIPTION b/DESCRIPTION index c314d23b00e6b89a59b35115977100a678ecbe97..27158eb4ec34ac3f4d98e1edc6202d799745518b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: splatter Type: Package Title: Simple Simulation of Single-cell RNA Sequencing Data -Version: 0.99.14 -Date: 2017-03-28 +Version: 0.99.15 +Date: 2017-04-14 Author: Luke Zappia Authors@R: c(person("Luke", "Zappia", role = c("aut", "cre"), diff --git a/NAMESPACE b/NAMESPACE index e56ae21db7a3edff1c5cfb54b66055001bf93d77..bf36ba081502bbf193bd025b5e0557ac2b84cf95 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -39,6 +39,7 @@ export(splatSimulate) export(splatSimulateGroups) export(splatSimulatePaths) export(splatSimulateSingle) +export(summariseDiff) exportClasses(Lun2Params) exportClasses(LunParams) exportClasses(SCDDParams) diff --git a/NEWS.md b/NEWS.md index 3936f24c405be1e5c880b3ce466d6f684cac493b..dd555cb84324f931ed635f485d5edae3668a6eef 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# 0.99.15 + +* Add summariseDiff function +* Add BPPARAM argument to scDDSimulate +* Adjust default Spalt DE factor parameters +* Add limits to zeros diff plots +* Remove estimation of dropout.present + # 0.99.14 * Add functions for making comparison panels diff --git a/R/AllClasses.R b/R/AllClasses.R index 508407726476615522391240b783731d90c9f802..0b7f9053a2b91ba4654ea24949447ccd69efd529 100644 --- a/R/AllClasses.R +++ b/R/AllClasses.R @@ -207,8 +207,8 @@ setClass("SplatParams", out.facScale = 0.5, de.prob = 0.1, de.downProb = 0.5, - de.facLoc = 4, - de.facScale = 1, + de.facLoc = 0.1, + de.facScale = 0.4, bcv.common = 0.1, bcv.df = 60, dropout.present = FALSE, diff --git a/R/SplatParams-methods.R b/R/SplatParams-methods.R index 07f9916d2208b24a478d3d3155380b7794a5186e..b1dc35ab4e73282182bfaae6f7e62ddb068f5d48 100644 --- a/R/SplatParams-methods.R +++ b/R/SplatParams-methods.R @@ -111,7 +111,7 @@ setMethod("show", "SplatParams", function(object) { "[Scale]" = "de.facScale"), "BCV:" = c("(Common Disp)" = "bcv.common", "(DoF)" = "bcv.df"), - "Dropout:" = c("(Present)" = "dropout.present", + "Dropout:" = c("[Present]" = "dropout.present", "(Midpoint)" = "dropout.mid", "(Shape)" = "dropout.shape"), "Paths:" = c("[From]" = "path.from", diff --git a/R/compare.R b/R/compare.R index 4e0feb06054e2b8108172f6e47df11d2d872a3a4..c11f53af2dec964b4a7d828bb241b71a4176a002 100644 --- a/R/compare.R +++ b/R/compare.R @@ -400,6 +400,7 @@ diffSCESets <- function(sces, ref, point.size = 0.1, point.alpha = 0.1, colour = "Dataset")) + geom_hline(yintercept = 0, colour = "red") + geom_boxplot() + + scale_y_continuous(limits = c(0, 100)) + scale_colour_manual(values = colours) + ylab(paste("Rank difference percentage zeros")) + ggtitle("Difference in zeros per gene") + @@ -410,6 +411,7 @@ diffSCESets <- function(sces, ref, point.size = 0.1, point.alpha = 0.1, colour = "Dataset")) + geom_hline(yintercept = 0, colour = "red") + geom_boxplot() + + scale_y_continuous(limits = c(0, 100)) + scale_colour_manual(values = colours) + ylab(paste("Rank difference percentage zeros")) + ggtitle("Difference in zeros per cell") + @@ -784,3 +786,66 @@ makeOverallPanel <- function(comp, diff, title = "Overall comparison", return(panel) } + +#' Summarise diffSCESets +#' +#' Summarise the results of \code{\link{diffSCESets}}. The various +#' properties are sorted, differences calculated, the Median Absolute Deviation +#' taken as the summary statistic and the ranks calculated. +#' +#' @param diff Output from \code{\link{diffSCESets}} +#' +#' @return List with MADs, ranks and both combined in long format +#' @examples +#' sim1 <- splatSimulate(nGenes = 1000, groupCells = 20) +#' sim2 <- simpleSimulate(nGenes = 1000, nCells = 20) +#' difference <- diffSCESets(list(Splat = sim1, Simple = sim2), ref = "Simple") +#' summary <- summariseDiff(difference) +#' names(summary) +#' head(summary$Long) +#' @export +summariseDiff <- function(diff) { + + datasets <- unique(diff$PhenoData$Dataset) + + fData.mads <- sapply(datasets, function(dataset) { + df <- diff$FeatureData[diff$FeatureData$Dataset == dataset, ] + mean <- median(abs(df$RankDiffMeanLogCPM)) + var <- median(abs(df$RankDiffVarLogCPM)) + zeros <- median(abs(df$RankDiffZeros)) + mean.var <- median(abs(df$MeanRankVarDiff)) + mean.zeros <- median(abs(df$MeanRankZerosDiff)) + return(c(Mean = mean, Variance = var, ZerosGene = zeros, + MeanVar = mean.var, MeanZeros = mean.zeros)) + }) + + pData.mads <- sapply(datasets, function(dataset) { + df <- diff$PhenoData[diff$PhenoData$Dataset == dataset, ] + lib.size <- median(abs(df$RankDiffLibSize)) + zeros <- median(abs(df$RankDiffZeros)) + return(c(LibSize = lib.size, ZerosCell = zeros)) + }) + + mads <- data.frame(Dataset = datasets, t(fData.mads), t(pData.mads)) + + fData.ranks <- matrixStats::rowRanks(fData.mads) + pData.ranks <- matrixStats::rowRanks(pData.mads) + + ranks <- data.frame(Dataset = datasets, t(fData.ranks), t(pData.ranks)) + colnames(ranks) <- paste0(colnames(mads), "Rank") + + mads.long <- stats::reshape(mads, varying = 2:8, direction = "long", + idvar = "Dataset", timevar = "Statistic", + times = colnames(mads)[2:8], v.names = "MAD") + + ranks.long <- stats::reshape(ranks, varying = 2:8, direction = "long", + idvar = "Dataset", timevar = "Statistic", + times = colnames(ranks)[2:8], v.names = "Rank") + + long <- data.frame(mads.long, Rank = ranks.long$Rank) + row.names(long) <- NULL + + summary <- list(MADs = mads, Ranks = ranks, Long = long) + + return(summary) +} diff --git a/R/scDD-simulate.R b/R/scDD-simulate.R index c91706bfdd0d9cc84277216bc35686060eba653f..478443f68054c276c49d75e9418c93e18b9d4046 100644 --- a/R/scDD-simulate.R +++ b/R/scDD-simulate.R @@ -7,6 +7,9 @@ #' plots. #' @param plot.file File path to save plots as PDF. #' @param verbose logical. Whether to print progress messages +#' @param BPPARAM A \code{\link[BiocParallel]{BiocParallelParam}} instance +#' giving the parallel back-end to be used. Default is +#' \code{\link[BiocParallel]{SerialParam}} which uses a single core. #' @param ... any additional parameter settings to override what is provided in #' \code{params}. #' @@ -33,8 +36,10 @@ #' } #' @export #' @importFrom scater newSCESet +#' @importFrom BiocParallel SerialParam scDDSimulate <- function(params = newSCDDParams(), plots = FALSE, - plot.file = NULL, verbose = TRUE, ...) { + plot.file = NULL, verbose = TRUE, + BPPARAM = SerialParam(), ...) { checkmate::assertClass(params, "SCDDParams") params <- setParams(params, ...) @@ -62,7 +67,8 @@ scDDSimulate <- function(params = newSCDDParams(), plots = FALSE, plot.file = plot.file, random.seed = getParam(params, "seed"), varInflation = varInflation, - condition = getParam(params, "condition")) + condition = getParam(params, "condition"), + param = BPPARAM) } else { suppressMessages( scDD.sim <- scDD::simulateSet(SCdat = getParam(params, "SCdat"), @@ -79,7 +85,8 @@ scDDSimulate <- function(params = newSCDDParams(), plots = FALSE, plot.file = plot.file, random.seed = getParam(params, "seed"), varInflation = varInflation, - condition = getParam(params, "condition")) + condition = getParam(params, "condition"), + param = BPPARAM) ) } diff --git a/R/splat-estimate.R b/R/splat-estimate.R index d7a070e660e868e8f5e8001b7b2704a739797dc9..275b35f1432a6acf86c30f25c39a06ae36278083 100644 --- a/R/splat-estimate.R +++ b/R/splat-estimate.R @@ -195,8 +195,10 @@ splatEstBCV <- function(counts, params) { #' Estimate Splat dropout parameters #' #' Estimate the midpoint and shape parameters for the logistic function used -#' when simulating dropout. Also estimates whether dropout is likely to be -#' present in the dataset. +#' when simulating dropout. +#' +# #' Also estimates whether dropout is likely to be +# #' present in the dataset. #' #' @param norm.counts library size normalised counts matrix. #' @param params SplatParams object to store estimated values in. @@ -204,17 +206,19 @@ splatEstBCV <- function(counts, params) { #' @details #' Logistic function parameters are estimated by fitting a logistic function #' to the relationship between log2 mean gene expression and the proportion of -#' zeros in each gene. See \code{\link[stats]{nls}} for details of fitting. The -#' presence of dropout is determined by comparing the observed number of zeros -#' in each gene to the expected number of zeros from a negative binomial -#' distribution with the gene mean and a dispersion of 0.1. If the maximum -#' difference between the observed number of zeros and the expected number is -#' greater than 10 percent of the number of cells -#' (\code{max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)}) then dropout is -#' considered to be present in the dataset. This is a somewhat crude measure -#' but should give a reasonable indication. A more accurate approach is to look -#' at a plot of log2 mean expression vs the difference between observed and -#' expected number of zeros across all genes. +#' zeros in each gene. See \code{\link[stats]{nls}} for details of fitting. +#' +# #' The +# #' presence of dropout is determined by comparing the observed number of zeros +# #' in each gene to the expected number of zeros from a negative binomial +# #' distribution with the gene mean and a dispersion of 0.1. If the maximum +# #' difference between the observed number of zeros and the expected number is +# #' greater than 10 percent of the number of cells +# #' (\code{max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)}) then dropout +# #' is considered to be present in the dataset. This is a somewhat crude +# #' measure but should give a reasonable indication. A more accurate approach +# #' is to look at a plot of log2 mean expression vs the difference between +# #' observed and expected number of zeros across all genes. #' #' @return SplatParams object with estimated values. #' @@ -234,15 +238,14 @@ splatEstDropout <- function(norm.counts, params) { fit <- nls(y ~ logistic(x, x0 = x0, k = k), data = df, start = list(x0 = 0, k = -1)) - exp.zeros <- dnbinom(0, mu = means, size = 1 / 0.1) * ncol(norm.counts) + #exp.zeros <- dnbinom(0, mu = means, size = 1 / 0.1) * ncol(norm.counts) - present <- max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts) + #present <- max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts) mid <- summary(fit)$coefficients["x0", "Estimate"] shape <- summary(fit)$coefficients["k", "Estimate"] - params <- setParams(params, dropout.present = present, dropout.mid = mid, - dropout.shape = shape) + params <- setParams(params, dropout.mid = mid, dropout.shape = shape) return(params) } diff --git a/man/scDDSimulate.Rd b/man/scDDSimulate.Rd index a1fe862025bdb20d13a6eb6b638e3c41c967bcf0..387413b2a636f9169a9bc13bd5ed9382cc42d3b5 100644 --- a/man/scDDSimulate.Rd +++ b/man/scDDSimulate.Rd @@ -5,7 +5,7 @@ \title{scDD simulation} \usage{ scDDSimulate(params = newSCDDParams(), plots = FALSE, plot.file = NULL, - verbose = TRUE, ...) + verbose = TRUE, BPPARAM = SerialParam(), ...) } \arguments{ \item{params}{SCDDParams object containing simulation parameters.} @@ -17,6 +17,10 @@ plots.} \item{verbose}{logical. Whether to print progress messages} +\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam}} instance +giving the parallel back-end to be used. Default is +\code{\link[BiocParallel]{SerialParam}} which uses a single core.} + \item{...}{any additional parameter settings to override what is provided in \code{params}.} } diff --git a/man/splatEstDropout.Rd b/man/splatEstDropout.Rd index 669466993d3587dc8d840807483b366f6d7062aa..a2cf09d8272cdb2f5b698e556017192dd9368106 100644 --- a/man/splatEstDropout.Rd +++ b/man/splatEstDropout.Rd @@ -16,21 +16,10 @@ SplatParams object with estimated values. } \description{ Estimate the midpoint and shape parameters for the logistic function used -when simulating dropout. Also estimates whether dropout is likely to be -present in the dataset. +when simulating dropout. } \details{ Logistic function parameters are estimated by fitting a logistic function to the relationship between log2 mean gene expression and the proportion of -zeros in each gene. See \code{\link[stats]{nls}} for details of fitting. The -presence of dropout is determined by comparing the observed number of zeros -in each gene to the expected number of zeros from a negative binomial -distribution with the gene mean and a dispersion of 0.1. If the maximum -difference between the observed number of zeros and the expected number is -greater than 10 percent of the number of cells -(\code{max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)}) then dropout is -considered to be present in the dataset. This is a somewhat crude measure -but should give a reasonable indication. A more accurate approach is to look -at a plot of log2 mean expression vs the difference between observed and -expected number of zeros across all genes. +zeros in each gene. See \code{\link[stats]{nls}} for details of fitting. } diff --git a/man/summariseDiff.Rd b/man/summariseDiff.Rd new file mode 100644 index 0000000000000000000000000000000000000000..704cc86ebf7a1e0090513fe7919a2b01c27434a0 --- /dev/null +++ b/man/summariseDiff.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare.R +\name{summariseDiff} +\alias{summariseDiff} +\title{Summarise diffSCESets} +\usage{ +summariseDiff(diff) +} +\arguments{ +\item{diff}{Output from \code{\link{diffSCESets}}} +} +\value{ +List with MADs, ranks and both combined in long format +} +\description{ +Summarise the results of \code{\link{diffSCESets}}. The various +properties are sorted, differences calculated, the Median Absolute Deviation +taken as the summary statistic and the ranks calculated. +} +\examples{ +sim1 <- splatSimulate(nGenes = 1000, groupCells = 20) +sim2 <- simpleSimulate(nGenes = 1000, nCells = 20) +difference <- diffSCESets(list(Splat = sim1, Simple = sim2), ref = "Simple") +summary <- summariseDiff(difference) +names(summary) +head(summary$Long) +}