Skip to content
Snippets Groups Projects
Commit f9d4f54e authored by Luke Zappia's avatar Luke Zappia
Browse files

Add estimation functions

parent 29667ff1
No related branches found
No related tags found
No related merge requests found
Package: splatter Package: splatter
Type: Package Type: Package
Title: Simple Simulation of Single-cell RNA Sequencing Data Title: Simple Simulation of Single-cell RNA Sequencing Data
Version: 0.1.0 Version: 0.2.0
Date: 2016-10-06
Author: Luke Zappia Author: Luke Zappia
Authors@R: as.person(c( Authors@R: as.person(c(
"Luke Zappia <luke.zappia@mcri.edu.au> [aut, cre]", "Luke Zappia <luke.zappia@mcri.edu.au> [aut, cre]",
...@@ -15,10 +16,15 @@ Description: Splatter is a package for the simulation of single-cell RNA ...@@ -15,10 +16,15 @@ Description: Splatter is a package for the simulation of single-cell RNA
License: GPL-3 + file LICENSE License: GPL-3 + file LICENSE
LazyData: TRUE LazyData: TRUE
Depends: Depends:
R (>= 3.3) R (>= 3.3),
scater
Imports: limma,
fitdistrplus,
edgeR
Suggests:
testthat
biocViews: SingleCell, RNASeq, Transcriptomics, GeneExpression, Sequencing, biocViews: SingleCell, RNASeq, Transcriptomics, GeneExpression, Sequencing,
Software Software
URL: https://github.com/lazappi/splatter URL: https://github.com/lazappi/splatter
BugReports: https://github.com/lazappi/splatter/issues BugReports: https://github.com/lazappi/splatter/issues
RoxygenNote: 5.0.1 RoxygenNote: 5.0.1
Suggests: testthat
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
S3method(print,splatParams) S3method(print,splatParams)
export(checkParams) export(checkParams)
export(defaultParams) export(defaultParams)
export(estimateParams)
export(getParams) export(getParams)
export(mergeParams) export(mergeParams)
export(setParams) export(setParams)
......
#' Estimate simulation parameters
#'
#' Estimate simulation parameters from a real dataset. See the individual
#' estimation functions for more details on how this is done.
#'
#' @param x either a counts matrix or an SCESet object containing count data to
#' estimate parameters from.
#' @param params splatParams object to store estimated values in. If not
#' provided a new object will be created.
#'
#' @return A params object containing the estimated parameters.
#'
#' @seealso
#' \code{\link{estMeanParams}}, \code{\link{estLibParams}},
#' \code{\link{estOutlierParams}}, \code{\link{estBCVParams}},
#' \code{\link{estDropoutParams}}
#'
#' @examples
#' data("sc_example_counts")
#' params <- estimateParams(sc_example_counts)
#' params
#' # Replace defaults with estimated params
#' params <- defaultParams()
#' params <- estimateParams(sc_example_counts, params)
#' params
#' @export
estimateParams <- function(x, params = NULL) UseMethod("estimateParams")
#' @rdname estimateParams
estimateParams.SCESet <- function(sce, params = NULL) {
counts <- scater::counts(sce)
estimateParams(counts, params)
}
#' @rdname estimateParams
estimateParams.matrix <- function(counts, params = NULL) {
if (is.null(params)) {
params <- splatParams()
}
# Normalise for library size and remove all zeros
lib.sizes <- colSums(counts)
lib.med <- median(lib.sizes)
norm.counts <- t(t(counts) / lib.sizes * lib.med)
norm.counts <- norm.counts[rowSums(norm.counts > 0) > 1, ]
params <- estMeanParams(norm.counts, params)
params <- estLibParams(counts, params)
params <- estOutlierParams(norm.counts, params)
params <- estBCVParams(counts, params)
params <- estDropoutParams(norm.counts, params)
return(params)
}
#' Estimate mean parameters
#'
#' Estimate rate and shape parameters for the gamma distribution used to
#' simulate gene expression means. Uses a method of moments approach.
#'
#' @param norm.counts library size normalised counts matrix.
#' @param params splatParams object to store estimated values in.
#'
#' @return splatParams object with estimated values.
#' @examples
#' data("sc_example_counts")
#' norm_ex_counts <- t(t(sc_example_counts) / colSums(sc_example_counts) *
#' median(colSums(sc_example_counts)))
#' params <- params()
#' params <- estMeanParams(norm_ex_counts, params)
#' params
estMeanParams <- function(norm.counts, params) {
means <- rowMeans(norm.counts)
means <- means[means != 0]
z <- log(means)
mean.z <- mean(z)
var.z <- var(z)
alpha <- limma::trigammaInverse(var.z)
beta <- exp(digamma(alpha) - mean.z)
params <- setParams(params, mean.shape = alpha, mean.rate = beta)
return(params)
}
#' Estimate library size parameters
#'
#' A log-normal distribution is fitted to the library sizes and the estimated
#' parameters are added to the params object. See \code{\link{fitdist}} for
#' details on the fitting.
#'
#' @param counts counts matrix to estimate parameters from.
#' @param params splatParams object to store estimated values in.
#'
#' @return splatParams object with estimated values.
#' @examples
#' data("sc_example_counts")
#' params <- params()
#' params <- estLibParams(sc_example_counts, params)
#' params
estLibParams <- function(counts, params) {
lib.sizes <- colSums(counts)
fit <- fitdistrplus::fitdist(lib.sizes, "lnorm")
params <- setParams(params, lib.loc = unname(fit$estimate["meanlog"]),
lib.scale = unname(fit$estimate["sdlog"]))
return(params)
}
#' Estimate expression outlier parameters
#'
#' Parameters are estimated by comparing means of individual genes to the
#' median mean expression level.
#'
#' @param norm.counts library size normalised counts matrix.
#' @param params splatParams object to store estimated values in.
#'
#' @details
#' Expression outlier genes are detected using the Median Absolute Deviation
#' (MAD) from median method. If the log2 mean expression of a gene is greater
#' than two MADs from the median log2 mean expression it is designated as a
#' outlier. The proportion of outlier genes is used to estimate the outlier
#' probability. The low outlier probability is estimated as the proportion of
#' outlier genes that have a log2 mean less than the median log2 mean. Factors
#' for each outlier gene are calculated by dividing mean expression by the
#' median mean expression. A log-normal distribution is then fitted to these
#' factors in order to estimate the outlier factor location and scale
#' parameters. See \code{\link{fitdist}} for details on the fitting.
#'
#' @return splatParams object with estimated values.
#' @examples
#' data("sc_example_counts")
#' norm_ex_counts <- t(t(sc_example_counts) / colSums(sc_example_counts) *
#' median(colSums(sc_example_counts)))
#' params <- params()
#' params <- estOutlierParams(norm_ex_counts, params)
#' params
estOutlierParams <- function(norm.counts, params) {
means <- rowMeans(norm.counts)
lmeans <- log(means)
med <- median(lmeans)
mad <- mad(lmeans)
lo.bound <- med - 2 * mad
hi.bound <- med + 2 * mad
lo.outs <- which(lmeans < lo.bound)
hi.outs <- which(lmeans > hi.bound)
prob <- (length(lo.outs) + length(hi.outs)) / nrow(norm.counts)
lo.prob <- length(lo.outs) / (length(lo.outs) + length(hi.outs))
facs <- means[c(lo.outs, hi.outs)] / median(means)
fit <- fitdistrplus::fitdist(facs, "lnorm")
params <- setParams(params, out.prob = prob, out.loProb = lo.prob,
out.facLoc = unname(fit$estimate["meanlog"]),
out.facScale = unname(fit$estimate["sdlog"]))
return(params)
}
#' Estimate Biological Coefficient of Variation parameters
#'
#' Parameters are estimated using the \code{estimateDisp} function in the
#' \code{edgeR} package. Specifically the common dispersion and prior degrees
#' of freedom. See \code{\link{estimateDisp}} for details.
#'
#' @param counts counts matrix to estimate parameters from.
#' @param params splatParams object to store estimated values in.
#'
#' @return spaltParams object with estimated values.
#' @examples
#' data("sc_example_counts")
#' params <- params()
#' params <- estBCVParams(sc_example_counts, params)
#' params
estBCVParams <- function(counts, params) {
# Add dummy design matrix to avoid print statement
design <- matrix(1, ncol(counts), 1)
disps <- edgeR::estimateDisp(counts, design = design)
params <- setParams(params, bcv.common = disps$common.dispersion,
bcv.DF = disps$prior.df)
return(params)
}
#' Estimate dropout parameters
#'
#' Estimate the midpoint and shape parameters for the logistic function used
#' when simulating dropout. Also estimates whether dropout is likely to be
#' present in the dataset.
#'
#' @param norm.counts library size normalised counts matrix.
#' @param params splatParams object to store estimated values in.
#'
#' @details
#' Logistic function parameters are estimated by fitting a logistic function
#' to the relationship between log2 mean gene expression and the proportion of
#' zeros in each gene. See \code{\link{nls}} for details of fitting. The
#' presence of dropout is determined by comparing the observed number of zeros
#' in each gene to the expected number of zeros from a negative binomial
#' distribution with the gene mean and a dispersion of 0.1. If the maximum
#' difference between the observed number of zeros and the expected number is
#' greater than 10 percent of the number of cells
#' (\code{max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)}) then dropout is
#' considered to be present in the dataset. This is a somewhat crude measure
#' but should give a reasonable indication. A more accurate approach is to look
#' at a plot of log2 mean expression vs the difference between observed and
#' expected number of zeros across all genes.
#'
#' @return Params object with estimated values.
#' @examples
#' data("sc_example_counts")
#' norm_ex_counts <- t(t(sc_example_counts) / colSums(sc_example_counts) *
#' median(colSums(sc_example_counts)))
#' params <- params()
#' params <- estMeanParams(norm_ex_counts, params)
estDropoutParams <- function(norm.counts, params) {
means <- rowMeans(norm.counts)
x <- log(means)
obs.zeros <- rowSums(norm.counts == 0)
y <- obs.zeros / ncol(norm.counts)
df <- data.frame(x, y)
fit <- nls(y ~ logistic(x, x0 = x0, k = k), data = df,
start = list(x0 = 0, k = -1))
exp.zeros <- dnbinom(0, mu = means, size = 1 / 0.1) * ncol(norm.counts)
present <- max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)
mid <- summary(fit)$coefficients["x0", "Estimate"]
shape <- summary(fit)$coefficients["k", "Estimate"]
params <- setParams(params, dropout.present = present, dropout.mid = mid,
dropout.shape = shape)
return(params)
}
\ No newline at end of file
#' Logistic function
#'
#' Implementation of the logistic function
#'
#' @param x value to apply the function to.
#' @param x0 midpoint parameter. Gives the centre of the function.
#' @param k shape parameter. Gives the slope of the function.
#'
#' @return RETURN DESCRIPTION
#' @examples
#' logistic(0, 1, 1)
logistic <- function(x, x0, k) {
1 / (1 + exp(-k * (x - x0)))
}
\ No newline at end of file
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/estimate.R
\name{estBCVParams}
\alias{estBCVParams}
\title{Estimate Biological Coefficient of Variation parameters}
\usage{
estBCVParams(counts, params)
}
\arguments{
\item{counts}{counts matrix to estimate parameters from.}
\item{params}{splatParams object to store estimated values in.}
}
\value{
spaltParams object with estimated values.
}
\description{
Parameters are estimated using the \code{estimateDisp} function in the
\code{edgeR} package. Specifically the common dispersion and prior degrees
of freedom. See \code{\link{estimateDisp}} for details.
}
\examples{
data("sc_example_counts")
params <- params()
params <- estBCVParams(sc_example_counts, params)
params
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/estimate.R
\name{estDropoutParams}
\alias{estDropoutParams}
\title{Estimate dropout parameters}
\usage{
estDropoutParams(norm.counts, params)
}
\arguments{
\item{norm.counts}{library size normalised counts matrix.}
\item{params}{splatParams object to store estimated values in.}
}
\value{
Params object with estimated values.
}
\description{
Estimate the midpoint and shape parameters for the logistic function used
when simulating dropout. Also estimates whether dropout is likely to be
present in the dataset.
}
\details{
Logistic function parameters are estimated by fitting a logistic function
to the relationship between log2 mean gene expression and the proportion of
zeros in each gene. See \code{\link{nls}} for details of fitting. The
presence of dropout is determined by comparing the observed number of zeros
in each gene to the expected number of zeros from a negative binomial
distribution with the gene mean and a dispersion of 0.1. If the maximum
difference between the observed number of zeros and the expected number is
greater than 10 percent of the number of cells
(\code{max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)}) then dropout is
considered to be present in the dataset. This is a somewhat crude measure
but should give a reasonable indication. A more accurate approach is to look
at a plot of log2 mean expression vs the difference between observed and
expected number of zeros across all genes.
}
\examples{
data("sc_example_counts")
norm_ex_counts <- t(t(sc_example_counts) / colSums(sc_example_counts) *
median(colSums(sc_example_counts)))
params <- params()
params <- estMeanParams(norm_ex_counts, params)
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/estimate.R
\name{estLibParams}
\alias{estLibParams}
\title{Estimate library size parameters}
\usage{
estLibParams(counts, params)
}
\arguments{
\item{counts}{counts matrix to estimate parameters from.}
\item{params}{splatParams object to store estimated values in.}
}
\value{
splatParams object with estimated values.
}
\description{
A log-normal distribution is fitted to the library sizes and the estimated
parameters are added to the params object. See \code{\link{fitdist}} for
details on the fitting.
}
\examples{
data("sc_example_counts")
params <- params()
params <- estLibParams(sc_example_counts, params)
params
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/estimate.R
\name{estMeanParams}
\alias{estMeanParams}
\title{Estimate mean parameters}
\usage{
estMeanParams(norm.counts, params)
}
\arguments{
\item{norm.counts}{library size normalised counts matrix.}
\item{params}{splatParams object to store estimated values in.}
}
\value{
splatParams object with estimated values.
}
\description{
Estimate rate and shape parameters for the gamma distribution used to
simulate gene expression means. Uses a method of moments approach.
}
\examples{
data("sc_example_counts")
norm_ex_counts <- t(t(sc_example_counts) / colSums(sc_example_counts) *
median(colSums(sc_example_counts)))
params <- params()
params <- estMeanParams(norm_ex_counts, params)
params
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/estimate.R
\name{estOutlierParams}
\alias{estOutlierParams}
\title{Estimate expression outlier parameters}
\usage{
estOutlierParams(norm.counts, params)
}
\arguments{
\item{norm.counts}{library size normalised counts matrix.}
\item{params}{splatParams object to store estimated values in.}
}
\value{
splatParams object with estimated values.
}
\description{
Parameters are estimated by comparing means of individual genes to the
median mean expression level.
}
\details{
Expression outlier genes are detected using the Median Absolute Deviation
(MAD) from median method. If the log2 mean expression of a gene is greater
than two MADs from the median log2 mean expression it is designated as a
outlier. The proportion of outlier genes is used to estimate the outlier
probability. The low outlier probability is estimated as the proportion of
outlier genes that have a log2 mean less than the median log2 mean. Factors
for each outlier gene are calculated by dividing mean expression by the
median mean expression. A log-normal distribution is then fitted to these
factors in order to estimate the outlier factor location and scale
parameters. See \code{\link{fitdist}} for details on the fitting.
}
\examples{
data("sc_example_counts")
norm_ex_counts <- t(t(sc_example_counts) / colSums(sc_example_counts) *
median(colSums(sc_example_counts)))
params <- params()
params <- estOutlierParams(norm_ex_counts, params)
params
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/estimate.R
\name{estimateParams}
\alias{estimateParams}
\alias{estimateParams.SCESet}
\alias{estimateParams.matrix}
\title{Estimate simulation parameters}
\usage{
estimateParams(x, params = NULL)
\method{estimateParams}{SCESet}(sce, params = NULL)
\method{estimateParams}{matrix}(counts, params = NULL)
}
\arguments{
\item{x}{either a counts matrix or an SCESet object containing count data to
estimate parameters from.}
\item{params}{splatParams object to store estimated values in. If not
provided a new object will be created.}
}
\value{
A params object containing the estimated parameters.
}
\description{
Estimate simulation parameters from a real dataset. See the individual
estimation functions for more details on how this is done.
}
\examples{
data("sc_example_counts")
params <- estimateParams(sc_example_counts)
params
# Replace defaults with estimated params
params <- defaultParams()
params <- estimateParams(sc_example_counts, params)
params
}
\seealso{
\code{\link{estMeanParams}}, \code{\link{estLibParams}},
\code{\link{estOutlierParams}}, \code{\link{estBCVParams}},
\code{\link{estDropoutParams}}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{logistic}
\alias{logistic}
\title{Logistic function}
\usage{
logistic(x, x0, k)
}
\arguments{
\item{x}{value to apply the function to.}
\item{x0}{midpoint parameter. Gives the centre of the function.}
\item{k}{shape parameter. Gives the slope of the function.}
}
\value{
RETURN DESCRIPTION
}
\description{
Implementation of the logistic function
}
\examples{
logistic(0, 1, 1)
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment