Skip to content
Snippets Groups Projects
Commit cc0bbdea authored by Luke Zappia's avatar Luke Zappia
Browse files

Merge branch 'master' into devel

* master:
  Version 0.99.10
  Remove out.loProb from vignette
  Update default Splat parameters
  Remove out.loProb parameter
  Update estimation procedure

From: Luke Zappia <lazappi@users.noreply.github.com>

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/splatter@127213 bc3139a8-67e5-0310-9ffc-ced21a209358
parent 32e53deb
No related branches found
No related tags found
No related merge requests found
Package: splatter Package: splatter
Type: Package Type: Package
Title: Simple Simulation of Single-cell RNA Sequencing Data Title: Simple Simulation of Single-cell RNA Sequencing Data
Version: 0.99.9 Version: 0.99.10
Date: 2017-02-02 Date: 2017-03-07
Author: Luke Zappia Author: Luke Zappia
Authors@R: Authors@R:
c(person("Luke", "Zappia", role = c("aut", "cre"), c(person("Luke", "Zappia", role = c("aut", "cre"),
......
# 0.99.10
* Improve Splat estimation procedure
* Update default Splat parameters
* Remove out.loProb Splat parameter
* Add MeanZeros plot to compareSCESets
# 0.99.9 # 0.99.9
* Add addGeneLengths function * Add addGeneLengths function
......
...@@ -96,8 +96,6 @@ setClass("SimpleParams", ...@@ -96,8 +96,6 @@ setClass("SimpleParams",
#' \describe{ #' \describe{
#' \item{\code{out.prob}}{Probability that a gene is an expression #' \item{\code{out.prob}}{Probability that a gene is an expression
#' outlier.} #' outlier.}
#' \item{\code{out.loProb}}{Probability that an expression outlier
#' gene is lowly expressed.}
#' \item{\code{out.facLoc}}{Location (meanlog) parameter for the #' \item{\code{out.facLoc}}{Location (meanlog) parameter for the
#' expression outlier factor log-normal distribution.} #' expression outlier factor log-normal distribution.}
#' \item{\code{out.facScale}}{Scale (sdlog) parameter for the #' \item{\code{out.facScale}}{Scale (sdlog) parameter for the
...@@ -182,7 +180,6 @@ setClass("SplatParams", ...@@ -182,7 +180,6 @@ setClass("SplatParams",
lib.loc = "numeric", lib.loc = "numeric",
lib.scale = "numeric", lib.scale = "numeric",
out.prob = "numeric", out.prob = "numeric",
out.loProb = "numeric",
out.facLoc = "numeric", out.facLoc = "numeric",
out.facScale = "numeric", out.facScale = "numeric",
de.prob = "numeric", de.prob = "numeric",
...@@ -202,20 +199,19 @@ setClass("SplatParams", ...@@ -202,20 +199,19 @@ setClass("SplatParams",
prototype = prototype(nGroups = 1, prototype = prototype(nGroups = 1,
groupCells = 100, groupCells = 100,
mean.rate = 0.3, mean.rate = 0.3,
mean.shape = 0.4, mean.shape = 0.6,
lib.loc = 10, lib.loc = 11,
lib.scale = 0.5, lib.scale = 0.2,
out.prob = 0.1, out.prob = 0.05,
out.loProb = 0.5,
out.facLoc = 4, out.facLoc = 4,
out.facScale = 1, out.facScale = 0.5,
de.prob = 0.1, de.prob = 0.1,
de.downProb = 0.5, de.downProb = 0.5,
de.facLoc = 4, de.facLoc = 4,
de.facScale = 1, de.facScale = 1,
bcv.common = 0.1, bcv.common = 0.1,
bcv.df = 25, bcv.df = 60,
dropout.present = TRUE, dropout.present = FALSE,
dropout.mid = 0, dropout.mid = 0,
dropout.shape = -1, dropout.shape = -1,
path.from = 0, path.from = 0,
...@@ -438,4 +434,4 @@ setClass("SCDDParams", ...@@ -438,4 +434,4 @@ setClass("SCDDParams",
sd.range = c(1, 3), sd.range = c(1, 3),
modeFC = c(2, 3, 4), modeFC = c(2, 3, 4),
varInflation = c(1, 1), varInflation = c(1, 1),
condition = "condition")) condition = "condition"))
\ No newline at end of file
...@@ -27,11 +27,6 @@ setValidity("SplatParams", function(object) { ...@@ -27,11 +27,6 @@ setValidity("SplatParams", function(object) {
lib.loc = checkNumber(v$lib.loc), lib.loc = checkNumber(v$lib.loc),
lib.scale = checkNumber(v$lib.scale, lower = 0), lib.scale = checkNumber(v$lib.scale, lower = 0),
out.prob = checkNumber(v$out.prob, lower = 0, upper = 1), out.prob = checkNumber(v$out.prob, lower = 0, upper = 1),
out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1),
out.facLoc = checkNumber(v$lib.loc),
out.facScale = checkNumber(v$lib.scale, lower = 0),
out.prob = checkNumber(v$out.prob, lower = 0, upper = 1),
out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1),
out.facLoc = checkNumber(v$out.facLoc), out.facLoc = checkNumber(v$out.facLoc),
out.facScale = checkNumber(v$out.facScale, lower = 0), out.facScale = checkNumber(v$out.facScale, lower = 0),
de.prob = checkNumeric(v$de.prob, lower = 0, upper = 1, de.prob = checkNumeric(v$de.prob, lower = 0, upper = 1,
...@@ -108,7 +103,6 @@ setMethod("show", "SplatParams", function(object) { ...@@ -108,7 +103,6 @@ setMethod("show", "SplatParams", function(object) {
"Library size:" = c("(Location)" = "lib.loc", "Library size:" = c("(Location)" = "lib.loc",
"(Scale)" = "lib.scale"), "(Scale)" = "lib.scale"),
"Exprs outliers:" = c("(Probability)" = "out.prob", "Exprs outliers:" = c("(Probability)" = "out.prob",
"(Lo Prob)" = "out.loProb",
"(Location)" = "out.facLoc", "(Location)" = "out.facLoc",
"(Scale)" = "out.facScale"), "(Scale)" = "out.facScale"),
"Diff expr:" = c("[Probability]" = "de.prob", "Diff expr:" = c("[Probability]" = "de.prob",
......
...@@ -24,6 +24,8 @@ ...@@ -24,6 +24,8 @@
#' that is zero.} #' that is zero.}
#' \item{\code{ZerosCell}}{Boxplot of the percentage of each cell #' \item{\code{ZerosCell}}{Boxplot of the percentage of each cell
#' that is zero.} #' that is zero.}
#' \item{\code{MeanZeros}}{Scatter plot with fitted lines showing
#' the mean-dropout relationship.}
#' } #' }
#' } #' }
#' } #' }
...@@ -95,8 +97,8 @@ compareSCESets <- function(sces) { ...@@ -95,8 +97,8 @@ compareSCESets <- function(sces) {
mean.var <- ggplot(fData.all, mean.var <- ggplot(fData.all,
aes_string(x = "mean_log_cpm", y = "var_log_cpm", aes_string(x = "mean_log_cpm", y = "var_log_cpm",
olour = "Dataset", fill = "Dataset")) + colour = "Dataset", fill = "Dataset")) +
geom_point() + geom_point(alpha = 0.2) +
geom_smooth() + geom_smooth() +
xlab(expression(paste("Mean ", log[2], "(CPM + 1)"))) + xlab(expression(paste("Mean ", log[2], "(CPM + 1)"))) +
ylab(expression(paste("Variance ", log[2], "(CPM + 1)"))) + ylab(expression(paste("Variance ", log[2], "(CPM + 1)"))) +
...@@ -130,6 +132,16 @@ compareSCESets <- function(sces) { ...@@ -130,6 +132,16 @@ compareSCESets <- function(sces) {
ggtitle("Distribution of zeros per cell") + ggtitle("Distribution of zeros per cell") +
theme_minimal() theme_minimal()
mean.zeros <- ggplot(fData.all,
aes_string(x = "mean_log_cpm", y = "pct_dropout",
colour = "Dataset", fill = "Dataset")) +
geom_point(alpha = 0.2) +
geom_smooth() +
xlab(expression(paste("Mean ", log[2], "(CPM + 1)"))) +
ylab(expression(paste("Percentage zeros"))) +
ggtitle("Mean-dropout relationship") +
theme_minimal()
comparison <- list(FeatureData = fData.all, comparison <- list(FeatureData = fData.all,
PhenoData = pData.all, PhenoData = pData.all,
Plots = list(Means = means, Plots = list(Means = means,
...@@ -137,7 +149,8 @@ compareSCESets <- function(sces) { ...@@ -137,7 +149,8 @@ compareSCESets <- function(sces) {
MeanVar = mean.var, MeanVar = mean.var,
LibrarySizes = libs, LibrarySizes = libs,
ZerosGene = z.gene, ZerosGene = z.gene,
ZerosCell = z.cell)) ZerosCell = z.cell,
MeanZeros = mean.zeros))
return(comparison) return(comparison)
} }
...@@ -59,19 +59,34 @@ splatEstimate.matrix <- function(counts, params = newSplatParams()) { ...@@ -59,19 +59,34 @@ splatEstimate.matrix <- function(counts, params = newSplatParams()) {
#' Estimate Splat mean parameters #' Estimate Splat mean parameters
#' #'
#' Estimate rate and shape parameters for the gamma distribution used to #' Estimate rate and shape parameters for the gamma distribution used to
#' simulate gene expression means using the 'moment matching estimation' method #' simulate gene expression means.
#' of \code{\link[fitdistrplus]{fitdist}}.
#' #'
#' @param norm.counts library size normalised counts matrix. #' @param norm.counts library size normalised counts matrix.
#' @param params SplatParams object to store estimated values in. #' @param params SplatParams object to store estimated values in.
#' #'
#' @details
#' Parameter for the gamma distribution are estimated by fitting the mean
#' normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum
#' goodness-of-fit estimation' method is used to minimise the Cramer-von Mises
#' distance. This can fail in some situations, in which case the 'method of
#' moments estimation' method is used instead. Prior to fitting the means are
#' winsorized by setting the top and bottom 10 percent of values to the 10th
#' and 90th percentiles.
#'
#' @return SplatParams object with estimated values. #' @return SplatParams object with estimated values.
splatEstMean <- function(norm.counts, params) { splatEstMean <- function(norm.counts, params) {
means <- rowMeans(norm.counts) means <- rowMeans(norm.counts)
means <- means[means != 0] means <- means[means != 0]
fit <- fitdistrplus::fitdist(means, "gamma", method = "mme") means <- winsorize(means, q = 0.1)
fit <- try(fitdistrplus::fitdist(means, "gamma", method = "mge",
gof = "CvM"))
if (class(fit) == "try-error") {
warning("Goodness of fit failed, using Method of Moments")
fit <- fitdistrplus::fitdist(means, "gamma", method = "mme")
}
params <- setParams(params, mean.shape = unname(fit$estimate["shape"]), params <- setParams(params, mean.shape = unname(fit$estimate["shape"]),
mean.rate = unname(fit$estimate["rate"])) mean.rate = unname(fit$estimate["rate"]))
...@@ -111,15 +126,12 @@ splatEstLib <- function(counts, params) { ...@@ -111,15 +126,12 @@ splatEstLib <- function(counts, params) {
#' @details #' @details
#' Expression outlier genes are detected using the Median Absolute Deviation #' Expression outlier genes are detected using the Median Absolute Deviation
#' (MAD) from median method. If the log2 mean expression of a gene is greater #' (MAD) from median method. If the log2 mean expression of a gene is greater
#' than two MADs from the median log2 mean expression it is designated as a #' than two MADs above the median log2 mean expression it is designated as an
#' outlier. The proportion of outlier genes is used to estimate the outlier #' outlier. The proportion of outlier genes is used to estimate the outlier
#' probability. The low outlier probability is estimated as the proportion of #' probability. Factors for each outlier gene are calculated by dividing mean
#' outlier genes that have a log2 mean less than the median log2 mean. Factors #' expression by the median mean expression. A log-normal distribution is then
#' for each outlier gene are calculated by dividing mean expression by the #' fitted to these factors in order to estimate the outlier factor location and
#' median mean expression. A log-normal distribution is then fitted to these #' scale parameters using \code{\link[fitdistrplus]{fitdist}}.
#' factors in order to estimate the outlier factor location and scale
#' parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the
#' fitting.
#' #'
#' @return SplatParams object with estimated values. #' @return SplatParams object with estimated values.
splatEstOutlier <- function(norm.counts, params) { splatEstOutlier <- function(norm.counts, params) {
...@@ -130,34 +142,42 @@ splatEstOutlier <- function(norm.counts, params) { ...@@ -130,34 +142,42 @@ splatEstOutlier <- function(norm.counts, params) {
med <- median(lmeans) med <- median(lmeans)
mad <- mad(lmeans) mad <- mad(lmeans)
lo.bound <- med - 2 * mad bound <- med + 2 * mad
hi.bound <- med + 2 * mad
outs <- which(lmeans > bound)
lo.outs <- which(lmeans < lo.bound) prob <- length(outs) / nrow(norm.counts)
hi.outs <- which(lmeans > hi.bound)
prob <- (length(lo.outs) + length(hi.outs)) / nrow(norm.counts) params <- setParams(params, out.prob = prob)
lo.prob <- length(lo.outs) / (length(lo.outs) + length(hi.outs))
facs <- means[c(lo.outs, hi.outs)] / median(means) if (length(outs) > 1) {
fit <- fitdistrplus::fitdist(facs, "lnorm") facs <- means[outs] / median(means)
fit <- fitdistrplus::fitdist(facs, "lnorm")
params <- setParams(params, out.prob = prob, out.loProb = lo.prob, params <- setParams(params,
out.facLoc = unname(fit$estimate["meanlog"]), out.facLoc = unname(fit$estimate["meanlog"]),
out.facScale = unname(fit$estimate["sdlog"])) out.facScale = unname(fit$estimate["sdlog"]))
}
return(params) return(params)
} }
#' Estimate Splat Biological Coefficient of Variation parameters #' Estimate Splat Biological Coefficient of Variation parameters
#' #'
#' Parameters are estimated using the \code{estimateDisp} function in the #' Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function
#' \code{edgeR} package. Specifically the common dispersion and prior degrees #' in the \code{edgeR} package.
#' of freedom. See \code{\link{estimateDisp}} for details.
#' #'
#' @param counts counts matrix to estimate parameters from. #' @param counts counts matrix to estimate parameters from.
#' @param params SplatParams object to store estimated values in. #' @param params SplatParams object to store estimated values in.
#' #'
#' @details
#' The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common
#' dispersion and prior degrees of freedom. See
#' \code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on
#' simulated data we found a broadly linear relationship between the true
#' underlying common dispersion and the \code{edgR} estimate, therefore we
#' apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}.
#'
#' @return SplatParams object with estimated values. #' @return SplatParams object with estimated values.
splatEstBCV <- function(counts, params) { splatEstBCV <- function(counts, params) {
...@@ -165,7 +185,8 @@ splatEstBCV <- function(counts, params) { ...@@ -165,7 +185,8 @@ splatEstBCV <- function(counts, params) {
design <- matrix(1, ncol(counts), 1) design <- matrix(1, ncol(counts), 1)
disps <- edgeR::estimateDisp(counts, design = design) disps <- edgeR::estimateDisp(counts, design = design)
params <- setParams(params, bcv.common = disps$common.dispersion, params <- setParams(params,
bcv.common = 0.1 + 0.25 * disps$common.dispersion,
bcv.df = disps$prior.df) bcv.df = disps$prior.df)
return(params) return(params)
...@@ -224,4 +245,4 @@ splatEstDropout <- function(norm.counts, params) { ...@@ -224,4 +245,4 @@ splatEstDropout <- function(norm.counts, params) {
dropout.shape = shape) dropout.shape = shape)
return(params) return(params)
} }
\ No newline at end of file
...@@ -280,7 +280,6 @@ splatSimGeneMeans <- function(sim, params) { ...@@ -280,7 +280,6 @@ splatSimGeneMeans <- function(sim, params) {
mean.shape <- getParam(params, "mean.shape") mean.shape <- getParam(params, "mean.shape")
mean.rate <- getParam(params, "mean.rate") mean.rate <- getParam(params, "mean.rate")
out.prob <- getParam(params, "out.prob") out.prob <- getParam(params, "out.prob")
out.loProb <- getParam(params, "out.loProb")
out.facLoc <- getParam(params, "out.facLoc") out.facLoc <- getParam(params, "out.facLoc")
out.facScale <- getParam(params, "out.facScale") out.facScale <- getParam(params, "out.facScale")
...@@ -288,7 +287,7 @@ splatSimGeneMeans <- function(sim, params) { ...@@ -288,7 +287,7 @@ splatSimGeneMeans <- function(sim, params) {
base.means.gene <- rgamma(nGenes, shape = mean.shape, rate = mean.rate) base.means.gene <- rgamma(nGenes, shape = mean.shape, rate = mean.rate)
# Add expression outliers # Add expression outliers
outlier.facs <- getLNormFactors(nGenes, out.prob, out.loProb, out.facLoc, outlier.facs <- getLNormFactors(nGenes, out.prob, 0, out.facLoc,
out.facScale) out.facScale)
median.means.gene <- median(base.means.gene) median.means.gene <- median(base.means.gene)
outlier.means <- median.means.gene * outlier.facs outlier.means <- median.means.gene * outlier.facs
......
...@@ -26,4 +26,27 @@ rbindMatched <- function(df1, df2) { ...@@ -26,4 +26,27 @@ rbindMatched <- function(df1, df2) {
combined <- rbind(df1[, common.names], df2[, common.names]) combined <- rbind(df1[, common.names], df2[, common.names])
return(combined) return(combined)
} }
\ No newline at end of file
#' Winsorize vector
#'
#' Set outliers in a numeric vector to a specified percentile.
#'
#' @param x Numeric vector to winsorize
#' @param q Percentile to set from each end
#'
#' @return Winsorized numeric vector
winsorize <- function(x, q) {
checkmate::check_numeric(x, any.missing = FALSE)
checkmate::check_number(q, lower = 0, upper = 1)
lohi <- stats::quantile(x, c(q, 1 - q), na.rm = TRUE)
if (diff(lohi) < 0) { lohi <- rev(lohi) }
x[!is.na(x) & x < lohi[1]] <- lohi[1]
x[!is.na(x) & x > lohi[2]] <- lohi[2]
return(x)
}
...@@ -40,8 +40,6 @@ The Splatter simulation requires the following parameters: ...@@ -40,8 +40,6 @@ The Splatter simulation requires the following parameters:
\describe{ \describe{
\item{\code{out.prob}}{Probability that a gene is an expression \item{\code{out.prob}}{Probability that a gene is an expression
outlier.} outlier.}
\item{\code{out.loProb}}{Probability that an expression outlier
gene is lowly expressed.}
\item{\code{out.facLoc}}{Location (meanlog) parameter for the \item{\code{out.facLoc}}{Location (meanlog) parameter for the
expression outlier factor log-normal distribution.} expression outlier factor log-normal distribution.}
\item{\code{out.facScale}}{Scale (sdlog) parameter for the \item{\code{out.facScale}}{Scale (sdlog) parameter for the
......
...@@ -35,6 +35,8 @@ The return list has three items: ...@@ -35,6 +35,8 @@ The return list has three items:
that is zero.} that is zero.}
\item{\code{ZerosCell}}{Boxplot of the percentage of each cell \item{\code{ZerosCell}}{Boxplot of the percentage of each cell
that is zero.} that is zero.}
\item{\code{MeanZeros}}{Scatter plot with fitted lines showing
the mean-dropout relationship.}
} }
} }
} }
......
...@@ -15,7 +15,14 @@ splatEstBCV(counts, params) ...@@ -15,7 +15,14 @@ splatEstBCV(counts, params)
SplatParams object with estimated values. SplatParams object with estimated values.
} }
\description{ \description{
Parameters are estimated using the \code{estimateDisp} function in the Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function
\code{edgeR} package. Specifically the common dispersion and prior degrees in the \code{edgeR} package.
of freedom. See \code{\link{estimateDisp}} for details. }
\details{
The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common
dispersion and prior degrees of freedom. See
\code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on
simulated data we found a broadly linear relationship between the true
underlying common dispersion and the \code{edgR} estimate, therefore we
apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}.
} }
...@@ -16,6 +16,14 @@ SplatParams object with estimated values. ...@@ -16,6 +16,14 @@ SplatParams object with estimated values.
} }
\description{ \description{
Estimate rate and shape parameters for the gamma distribution used to Estimate rate and shape parameters for the gamma distribution used to
simulate gene expression means using the 'moment matching estimation' method simulate gene expression means.
of \code{\link[fitdistrplus]{fitdist}}. }
\details{
Parameter for the gamma distribution are estimated by fitting the mean
normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum
goodness-of-fit estimation' method is used to minimise the Cramer-von Mises
distance. This can fail in some situations, in which case the 'method of
moments estimation' method is used instead. Prior to fitting the means are
winsorized by setting the top and bottom 10 percent of values to the 10th
and 90th percentiles.
} }
...@@ -21,13 +21,10 @@ median mean expression level. ...@@ -21,13 +21,10 @@ median mean expression level.
\details{ \details{
Expression outlier genes are detected using the Median Absolute Deviation Expression outlier genes are detected using the Median Absolute Deviation
(MAD) from median method. If the log2 mean expression of a gene is greater (MAD) from median method. If the log2 mean expression of a gene is greater
than two MADs from the median log2 mean expression it is designated as a than two MADs above the median log2 mean expression it is designated as an
outlier. The proportion of outlier genes is used to estimate the outlier outlier. The proportion of outlier genes is used to estimate the outlier
probability. The low outlier probability is estimated as the proportion of probability. Factors for each outlier gene are calculated by dividing mean
outlier genes that have a log2 mean less than the median log2 mean. Factors expression by the median mean expression. A log-normal distribution is then
for each outlier gene are calculated by dividing mean expression by the fitted to these factors in order to estimate the outlier factor location and
median mean expression. A log-normal distribution is then fitted to these scale parameters using \code{\link[fitdistrplus]{fitdist}}.
factors in order to estimate the outlier factor location and scale
parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the
fitting.
} }
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{winsorize}
\alias{winsorize}
\title{Winsorize vector}
\usage{
winsorize(x, q)
}
\arguments{
\item{x}{Numeric vector to winsorize}
\item{q}{Percentile to set from each end}
}
\value{
Winsorized numeric vector
}
\description{
Set outliers in a numeric vector to a specified percentile.
}
...@@ -102,8 +102,6 @@ The parameters required for the Splat simulation are briefly described here: ...@@ -102,8 +102,6 @@ The parameters required for the Splat simulation are briefly described here:
distribution. distribution.
* **Expression outlier parameters** * **Expression outlier parameters**
* `out.prob` - Probability that a gene is an expression outlier. * `out.prob` - Probability that a gene is an expression outlier.
* `out.loProb` - Probability that an expression outlier gene is lowly
expressed (other outliers are highly expressed).
* `out.facLoc` - Location (meanlog) parameter for the expression outlier * `out.facLoc` - Location (meanlog) parameter for the expression outlier
factor log-normal distribution. factor log-normal distribution.
* `out.facScale` - Scale (sdlog) parameter for the expression outlier factor * `out.facScale` - Scale (sdlog) parameter for the expression outlier factor
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment