Skip to content
Snippets Groups Projects
Commit 3394256b authored by Luke Zappia's avatar Luke Zappia
Browse files

Merge branch 'master' into devel

* master:
  Version 0.99.15
  Remove dropout.present estimation
  Add limits to zeros plots
  Adjust default splat DE fac params
  Add BPPARAM to scDDSimulate
  Add summariseDiff function

From: Luke Zappia <lazappi@users.noreply.github.com>

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/splatter@128698 bc3139a8-67e5-0310-9ffc-ced21a209358
parent fcdc4b84
No related branches found
No related tags found
No related merge requests found
Package: splatter
Type: Package
Title: Simple Simulation of Single-cell RNA Sequencing Data
Version: 0.99.14
Date: 2017-03-28
Version: 0.99.15
Date: 2017-04-14
Author: Luke Zappia
Authors@R:
c(person("Luke", "Zappia", role = c("aut", "cre"),
......
......@@ -39,6 +39,7 @@ export(splatSimulate)
export(splatSimulateGroups)
export(splatSimulatePaths)
export(splatSimulateSingle)
export(summariseDiff)
exportClasses(Lun2Params)
exportClasses(LunParams)
exportClasses(SCDDParams)
......
# 0.99.15
* Add summariseDiff function
* Add BPPARAM argument to scDDSimulate
* Adjust default Spalt DE factor parameters
* Add limits to zeros diff plots
* Remove estimation of dropout.present
# 0.99.14
* Add functions for making comparison panels
......
......@@ -207,8 +207,8 @@ setClass("SplatParams",
out.facScale = 0.5,
de.prob = 0.1,
de.downProb = 0.5,
de.facLoc = 4,
de.facScale = 1,
de.facLoc = 0.1,
de.facScale = 0.4,
bcv.common = 0.1,
bcv.df = 60,
dropout.present = FALSE,
......
......@@ -111,7 +111,7 @@ setMethod("show", "SplatParams", function(object) {
"[Scale]" = "de.facScale"),
"BCV:" = c("(Common Disp)" = "bcv.common",
"(DoF)" = "bcv.df"),
"Dropout:" = c("(Present)" = "dropout.present",
"Dropout:" = c("[Present]" = "dropout.present",
"(Midpoint)" = "dropout.mid",
"(Shape)" = "dropout.shape"),
"Paths:" = c("[From]" = "path.from",
......
......@@ -400,6 +400,7 @@ diffSCESets <- function(sces, ref, point.size = 0.1, point.alpha = 0.1,
colour = "Dataset")) +
geom_hline(yintercept = 0, colour = "red") +
geom_boxplot() +
scale_y_continuous(limits = c(0, 100)) +
scale_colour_manual(values = colours) +
ylab(paste("Rank difference percentage zeros")) +
ggtitle("Difference in zeros per gene") +
......@@ -410,6 +411,7 @@ diffSCESets <- function(sces, ref, point.size = 0.1, point.alpha = 0.1,
colour = "Dataset")) +
geom_hline(yintercept = 0, colour = "red") +
geom_boxplot() +
scale_y_continuous(limits = c(0, 100)) +
scale_colour_manual(values = colours) +
ylab(paste("Rank difference percentage zeros")) +
ggtitle("Difference in zeros per cell") +
......@@ -784,3 +786,66 @@ makeOverallPanel <- function(comp, diff, title = "Overall comparison",
return(panel)
}
#' Summarise diffSCESets
#'
#' Summarise the results of \code{\link{diffSCESets}}. The various
#' properties are sorted, differences calculated, the Median Absolute Deviation
#' taken as the summary statistic and the ranks calculated.
#'
#' @param diff Output from \code{\link{diffSCESets}}
#'
#' @return List with MADs, ranks and both combined in long format
#' @examples
#' sim1 <- splatSimulate(nGenes = 1000, groupCells = 20)
#' sim2 <- simpleSimulate(nGenes = 1000, nCells = 20)
#' difference <- diffSCESets(list(Splat = sim1, Simple = sim2), ref = "Simple")
#' summary <- summariseDiff(difference)
#' names(summary)
#' head(summary$Long)
#' @export
summariseDiff <- function(diff) {
datasets <- unique(diff$PhenoData$Dataset)
fData.mads <- sapply(datasets, function(dataset) {
df <- diff$FeatureData[diff$FeatureData$Dataset == dataset, ]
mean <- median(abs(df$RankDiffMeanLogCPM))
var <- median(abs(df$RankDiffVarLogCPM))
zeros <- median(abs(df$RankDiffZeros))
mean.var <- median(abs(df$MeanRankVarDiff))
mean.zeros <- median(abs(df$MeanRankZerosDiff))
return(c(Mean = mean, Variance = var, ZerosGene = zeros,
MeanVar = mean.var, MeanZeros = mean.zeros))
})
pData.mads <- sapply(datasets, function(dataset) {
df <- diff$PhenoData[diff$PhenoData$Dataset == dataset, ]
lib.size <- median(abs(df$RankDiffLibSize))
zeros <- median(abs(df$RankDiffZeros))
return(c(LibSize = lib.size, ZerosCell = zeros))
})
mads <- data.frame(Dataset = datasets, t(fData.mads), t(pData.mads))
fData.ranks <- matrixStats::rowRanks(fData.mads)
pData.ranks <- matrixStats::rowRanks(pData.mads)
ranks <- data.frame(Dataset = datasets, t(fData.ranks), t(pData.ranks))
colnames(ranks) <- paste0(colnames(mads), "Rank")
mads.long <- stats::reshape(mads, varying = 2:8, direction = "long",
idvar = "Dataset", timevar = "Statistic",
times = colnames(mads)[2:8], v.names = "MAD")
ranks.long <- stats::reshape(ranks, varying = 2:8, direction = "long",
idvar = "Dataset", timevar = "Statistic",
times = colnames(ranks)[2:8], v.names = "Rank")
long <- data.frame(mads.long, Rank = ranks.long$Rank)
row.names(long) <- NULL
summary <- list(MADs = mads, Ranks = ranks, Long = long)
return(summary)
}
......@@ -7,6 +7,9 @@
#' plots.
#' @param plot.file File path to save plots as PDF.
#' @param verbose logical. Whether to print progress messages
#' @param BPPARAM A \code{\link[BiocParallel]{BiocParallelParam}} instance
#' giving the parallel back-end to be used. Default is
#' \code{\link[BiocParallel]{SerialParam}} which uses a single core.
#' @param ... any additional parameter settings to override what is provided in
#' \code{params}.
#'
......@@ -33,8 +36,10 @@
#' }
#' @export
#' @importFrom scater newSCESet
#' @importFrom BiocParallel SerialParam
scDDSimulate <- function(params = newSCDDParams(), plots = FALSE,
plot.file = NULL, verbose = TRUE, ...) {
plot.file = NULL, verbose = TRUE,
BPPARAM = SerialParam(), ...) {
checkmate::assertClass(params, "SCDDParams")
params <- setParams(params, ...)
......@@ -62,7 +67,8 @@ scDDSimulate <- function(params = newSCDDParams(), plots = FALSE,
plot.file = plot.file,
random.seed = getParam(params, "seed"),
varInflation = varInflation,
condition = getParam(params, "condition"))
condition = getParam(params, "condition"),
param = BPPARAM)
} else {
suppressMessages(
scDD.sim <- scDD::simulateSet(SCdat = getParam(params, "SCdat"),
......@@ -79,7 +85,8 @@ scDDSimulate <- function(params = newSCDDParams(), plots = FALSE,
plot.file = plot.file,
random.seed = getParam(params, "seed"),
varInflation = varInflation,
condition = getParam(params, "condition"))
condition = getParam(params, "condition"),
param = BPPARAM)
)
}
......
......@@ -195,8 +195,10 @@ splatEstBCV <- function(counts, params) {
#' Estimate Splat dropout parameters
#'
#' Estimate the midpoint and shape parameters for the logistic function used
#' when simulating dropout. Also estimates whether dropout is likely to be
#' present in the dataset.
#' when simulating dropout.
#'
# #' Also estimates whether dropout is likely to be
# #' present in the dataset.
#'
#' @param norm.counts library size normalised counts matrix.
#' @param params SplatParams object to store estimated values in.
......@@ -204,17 +206,19 @@ splatEstBCV <- function(counts, params) {
#' @details
#' Logistic function parameters are estimated by fitting a logistic function
#' to the relationship between log2 mean gene expression and the proportion of
#' zeros in each gene. See \code{\link[stats]{nls}} for details of fitting. The
#' presence of dropout is determined by comparing the observed number of zeros
#' in each gene to the expected number of zeros from a negative binomial
#' distribution with the gene mean and a dispersion of 0.1. If the maximum
#' difference between the observed number of zeros and the expected number is
#' greater than 10 percent of the number of cells
#' (\code{max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)}) then dropout is
#' considered to be present in the dataset. This is a somewhat crude measure
#' but should give a reasonable indication. A more accurate approach is to look
#' at a plot of log2 mean expression vs the difference between observed and
#' expected number of zeros across all genes.
#' zeros in each gene. See \code{\link[stats]{nls}} for details of fitting.
#'
# #' The
# #' presence of dropout is determined by comparing the observed number of zeros
# #' in each gene to the expected number of zeros from a negative binomial
# #' distribution with the gene mean and a dispersion of 0.1. If the maximum
# #' difference between the observed number of zeros and the expected number is
# #' greater than 10 percent of the number of cells
# #' (\code{max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)}) then dropout
# #' is considered to be present in the dataset. This is a somewhat crude
# #' measure but should give a reasonable indication. A more accurate approach
# #' is to look at a plot of log2 mean expression vs the difference between
# #' observed and expected number of zeros across all genes.
#'
#' @return SplatParams object with estimated values.
#'
......@@ -234,15 +238,14 @@ splatEstDropout <- function(norm.counts, params) {
fit <- nls(y ~ logistic(x, x0 = x0, k = k), data = df,
start = list(x0 = 0, k = -1))
exp.zeros <- dnbinom(0, mu = means, size = 1 / 0.1) * ncol(norm.counts)
#exp.zeros <- dnbinom(0, mu = means, size = 1 / 0.1) * ncol(norm.counts)
present <- max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)
#present <- max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)
mid <- summary(fit)$coefficients["x0", "Estimate"]
shape <- summary(fit)$coefficients["k", "Estimate"]
params <- setParams(params, dropout.present = present, dropout.mid = mid,
dropout.shape = shape)
params <- setParams(params, dropout.mid = mid, dropout.shape = shape)
return(params)
}
......@@ -5,7 +5,7 @@
\title{scDD simulation}
\usage{
scDDSimulate(params = newSCDDParams(), plots = FALSE, plot.file = NULL,
verbose = TRUE, ...)
verbose = TRUE, BPPARAM = SerialParam(), ...)
}
\arguments{
\item{params}{SCDDParams object containing simulation parameters.}
......@@ -17,6 +17,10 @@ plots.}
\item{verbose}{logical. Whether to print progress messages}
\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam}} instance
giving the parallel back-end to be used. Default is
\code{\link[BiocParallel]{SerialParam}} which uses a single core.}
\item{...}{any additional parameter settings to override what is provided in
\code{params}.}
}
......
......@@ -16,21 +16,10 @@ SplatParams object with estimated values.
}
\description{
Estimate the midpoint and shape parameters for the logistic function used
when simulating dropout. Also estimates whether dropout is likely to be
present in the dataset.
when simulating dropout.
}
\details{
Logistic function parameters are estimated by fitting a logistic function
to the relationship between log2 mean gene expression and the proportion of
zeros in each gene. See \code{\link[stats]{nls}} for details of fitting. The
presence of dropout is determined by comparing the observed number of zeros
in each gene to the expected number of zeros from a negative binomial
distribution with the gene mean and a dispersion of 0.1. If the maximum
difference between the observed number of zeros and the expected number is
greater than 10 percent of the number of cells
(\code{max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)}) then dropout is
considered to be present in the dataset. This is a somewhat crude measure
but should give a reasonable indication. A more accurate approach is to look
at a plot of log2 mean expression vs the difference between observed and
expected number of zeros across all genes.
zeros in each gene. See \code{\link[stats]{nls}} for details of fitting.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/compare.R
\name{summariseDiff}
\alias{summariseDiff}
\title{Summarise diffSCESets}
\usage{
summariseDiff(diff)
}
\arguments{
\item{diff}{Output from \code{\link{diffSCESets}}}
}
\value{
List with MADs, ranks and both combined in long format
}
\description{
Summarise the results of \code{\link{diffSCESets}}. The various
properties are sorted, differences calculated, the Median Absolute Deviation
taken as the summary statistic and the ranks calculated.
}
\examples{
sim1 <- splatSimulate(nGenes = 1000, groupCells = 20)
sim2 <- simpleSimulate(nGenes = 1000, nCells = 20)
difference <- diffSCESets(list(Splat = sim1, Simple = sim2), ref = "Simple")
summary <- summariseDiff(difference)
names(summary)
head(summary$Long)
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment