Add estimation functions

f9d4f54e · Luke Zappia · 29667ff1 · f9d4f54e · f9d4f54e · f9d4f54e
Commit f9d4f54e authored 8 years ago by Luke Zappia
--- a/DESCRIPTION
+++ b/DESCRIPTION
 Package: splatter
 Type: Package
 Title: Simple Simulation of Single-cell RNA Sequencing Data
-Version: 0.1.0
+Version: 0.2.0
+Date: 2016-10-06
 Author: Luke Zappia
 Authors@R: as.person(c(
    "Luke Zappia <luke.zappia@mcri.edu.au> [aut, cre]",
@@ -15,10 +16,15 @@ Description: Splatter is a package for the simulation of single-cell RNA
 License: GPL-3 + file LICENSE
 LazyData: TRUE
 Depends:
-    R (>= 3.3)
+    R (>= 3.3),
+    scater
+Imports: limma,
+    fitdistrplus,
+    edgeR
+Suggests:
+    testthat
 biocViews: SingleCell, RNASeq, Transcriptomics, GeneExpression, Sequencing,
    Software
 URL: https://github.com/lazappi/splatter
 BugReports: https://github.com/lazappi/splatter/issues
 RoxygenNote: 5.0.1
-Suggests: testthat
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,6 +3,7 @@
 S3method(print,splatParams)
 export(checkParams)
 export(defaultParams)
+export(estimateParams)
 export(getParams)
 export(mergeParams)
 export(setParams)

--- a/R/estimate.R
+++ b/R/estimate.R
+#' Estimate simulation parameters
+#'
+#' Estimate simulation parameters from a real dataset. See the individual
+#' estimation functions for more details on how this is done.
+#'
+#' @param x either a counts matrix or an SCESet object containing count data to
+#'        estimate parameters from.
+#' @param params splatParams object to store estimated values in. If not
+#'        provided a new object will be created.
+#'
+#' @return A params object containing the estimated parameters.
+#'
+#' @seealso
+#' \code{\link{estMeanParams}},  \code{\link{estLibParams}},
+#' \code{\link{estOutlierParams}}, \code{\link{estBCVParams}},
+#' \code{\link{estDropoutParams}}
+#'
+#' @examples
+#' data("sc_example_counts")
+#' params <- estimateParams(sc_example_counts)
+#' params
+#' # Replace defaults with estimated params
+#' params <- defaultParams()
+#' params <- estimateParams(sc_example_counts, params)
+#' params
+#' @export
+estimateParams <- function(x, params = NULL) UseMethod("estimateParams")
+#' @rdname estimateParams
+estimateParams.SCESet <- function(sce, params = NULL) {
+    counts <- scater::counts(sce)
+    estimateParams(counts, params)
+}
+#' @rdname estimateParams
+estimateParams.matrix <- function(counts, params = NULL) {
+    if (is.null(params)) {
+        params <- splatParams()
+    }
+    # Normalise for library size and remove all zeros
+    lib.sizes <- colSums(counts)
+    lib.med <- median(lib.sizes)
+    norm.counts <- t(t(counts) / lib.sizes * lib.med)
+    norm.counts <- norm.counts[rowSums(norm.counts > 0) > 1, ]
+    params <- estMeanParams(norm.counts, params)
+    params <- estLibParams(counts, params)
+    params <- estOutlierParams(norm.counts, params)
+    params <- estBCVParams(counts, params)
+    params <- estDropoutParams(norm.counts, params)
+    return(params)
+}
+#' Estimate mean parameters
+#'
+#' Estimate rate and shape parameters for the gamma distribution used to
+#' simulate gene expression means. Uses a method of moments approach.
+#'
+#' @param norm.counts library size normalised counts matrix.
+#' @param params splatParams object to store estimated values in.
+#'
+#' @return splatParams object with estimated values.
+#' @examples
+#' data("sc_example_counts")
+#' norm_ex_counts <- t(t(sc_example_counts) / colSums(sc_example_counts) *
+#'                   median(colSums(sc_example_counts)))
+#' params <- params()
+#' params <- estMeanParams(norm_ex_counts, params)
+#' params
+estMeanParams <- function(norm.counts, params) {
+    means <- rowMeans(norm.counts)
+    means <- means[means != 0]
+    z <- log(means)
+    mean.z <- mean(z)
+    var.z <- var(z)
+    alpha <- limma::trigammaInverse(var.z)
+    beta <- exp(digamma(alpha) - mean.z)
+    params <- setParams(params, mean.shape = alpha, mean.rate = beta)
+    return(params)
+}
+#' Estimate library size parameters
+#'
+#' A log-normal distribution is fitted to the library sizes and the estimated
+#' parameters are added to the params object. See \code{\link{fitdist}} for
+#' details on the fitting.
+#'
+#' @param counts counts matrix to estimate parameters from.
+#' @param params splatParams object to store estimated values in.
+#'
+#' @return splatParams object with estimated values.
+#' @examples
+#' data("sc_example_counts")
+#' params <- params()
+#' params <- estLibParams(sc_example_counts, params)
+#' params
+estLibParams <- function(counts, params) {
+    lib.sizes <- colSums(counts)
+    fit <- fitdistrplus::fitdist(lib.sizes, "lnorm")
+    params <- setParams(params, lib.loc = unname(fit$estimate["meanlog"]),
+                        lib.scale = unname(fit$estimate["sdlog"]))
+    return(params)
+}
+#' Estimate expression outlier parameters
+#'
+#' Parameters are estimated by comparing means of individual genes to the
+#' median mean expression level.
+#'
+#' @param norm.counts library size normalised counts matrix.
+#' @param params splatParams object to store estimated values in.
+#'
+#' @details
+#' Expression outlier genes are detected using the Median Absolute Deviation
+#' (MAD) from median method. If the log2 mean expression of a gene is greater
+#' than two MADs from the median log2 mean expression it is designated as a
+#' outlier. The proportion of outlier genes is used to estimate the outlier
+#' probability. The low outlier probability is estimated as the proportion of
+#' outlier genes that have a log2 mean less than the median log2 mean. Factors
+#' for each outlier gene are calculated by dividing mean expression by the
+#' median mean expression. A log-normal distribution is then fitted to these
+#' factors in order to estimate the outlier factor location and scale
+#' parameters. See \code{\link{fitdist}} for details on the fitting.
+#'
+#' @return splatParams object with estimated values.
+#' @examples
+#' data("sc_example_counts")
+#' norm_ex_counts <- t(t(sc_example_counts) / colSums(sc_example_counts) *
+#'                   median(colSums(sc_example_counts)))
+#' params <- params()
+#' params <- estOutlierParams(norm_ex_counts, params)
+#' params
+estOutlierParams <- function(norm.counts, params) {
+    means <- rowMeans(norm.counts)
+    lmeans <- log(means)
+    med <- median(lmeans)
+    mad <- mad(lmeans)
+    lo.bound <- med - 2 * mad
+    hi.bound <- med + 2 * mad
+    lo.outs <- which(lmeans < lo.bound)
+    hi.outs <- which(lmeans > hi.bound)
+    prob <- (length(lo.outs) + length(hi.outs)) / nrow(norm.counts)
+    lo.prob <- length(lo.outs) / (length(lo.outs) + length(hi.outs))
+    facs <- means[c(lo.outs, hi.outs)] / median(means)
+    fit <- fitdistrplus::fitdist(facs, "lnorm")
+    params <- setParams(params, out.prob = prob, out.loProb = lo.prob,
+                        out.facLoc = unname(fit$estimate["meanlog"]),
+                        out.facScale = unname(fit$estimate["sdlog"]))
+    return(params)
+}
+#' Estimate Biological Coefficient of Variation parameters
+#'
+#' Parameters are estimated using the \code{estimateDisp} function in the
+#' \code{edgeR} package. Specifically the common dispersion and prior degrees
+#' of freedom. See \code{\link{estimateDisp}} for details.
+#'
+#' @param counts counts matrix to estimate parameters from.
+#' @param params splatParams object to store estimated values in.
+#'
+#' @return spaltParams object with estimated values.
+#' @examples
+#' data("sc_example_counts")
+#' params <- params()
+#' params <- estBCVParams(sc_example_counts, params)
+#' params
+estBCVParams <- function(counts, params) {
+    # Add dummy design matrix to avoid print statement
+    design <- matrix(1, ncol(counts), 1)
+    disps <- edgeR::estimateDisp(counts, design = design)
+    params <- setParams(params, bcv.common = disps$common.dispersion,
+                        bcv.DF = disps$prior.df)
+    return(params)
+}
+#' Estimate dropout parameters
+#'
+#' Estimate the midpoint and shape parameters for the logistic function used
+#' when simulating dropout. Also estimates whether dropout is likely to be
+#' present in the dataset.
+#'
+#' @param norm.counts library size normalised counts matrix.
+#' @param params splatParams object to store estimated values in.
+#'
+#' @details
+#' Logistic function parameters are estimated by fitting a logistic function
+#' to the relationship between log2 mean gene expression and the proportion of
+#' zeros in each gene. See \code{\link{nls}} for details of fitting. The
+#' presence of dropout is determined by comparing the observed number of zeros
+#' in each gene to the expected number of zeros from a negative binomial
+#' distribution with the gene mean and a dispersion of 0.1. If the maximum
+#' difference between the observed number of zeros and the expected number is
+#' greater than 10 percent of the number of cells
+#' (\code{max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)}) then dropout is
+#' considered to be present in the dataset. This is a somewhat crude measure
+#' but should give a reasonable indication. A more accurate approach is to look
+#' at a plot of log2 mean expression vs the difference between observed and
+#' expected number of zeros across all genes.
+#'
+#' @return Params object with estimated values.
+#' @examples
+#' data("sc_example_counts")
+#' norm_ex_counts <- t(t(sc_example_counts) / colSums(sc_example_counts) *
+#'                   median(colSums(sc_example_counts)))
+#' params <- params()
+#' params <- estMeanParams(norm_ex_counts, params)
+estDropoutParams <- function(norm.counts, params) {
+    means <- rowMeans(norm.counts)
+    x <- log(means)
+    obs.zeros <- rowSums(norm.counts == 0)
+    y <- obs.zeros / ncol(norm.counts)
+    df <- data.frame(x, y)
+    fit <- nls(y ~ logistic(x, x0 = x0, k = k), data = df,
+               start = list(x0 = 0, k = -1))
+    exp.zeros <- dnbinom(0, mu = means, size = 1 / 0.1) * ncol(norm.counts)
+    present <- max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)
+    mid <- summary(fit)$coefficients["x0", "Estimate"]
+    shape <- summary(fit)$coefficients["k", "Estimate"]
+    params <- setParams(params, dropout.present = present, dropout.mid = mid,
+                        dropout.shape = shape)
+    return(params)
+}
\ No newline at end of file
--- a/R/utils.R
+++ b/R/utils.R
+#' Logistic function
+#'
+#' Implementation of the logistic function
+#'
+#' @param x value to apply the function to.
+#' @param x0 midpoint parameter. Gives the centre of the function.
+#' @param k shape parameter. Gives the slope of the function.
+#'
+#' @return RETURN DESCRIPTION
+#' @examples
+#' logistic(0, 1, 1)
+logistic <- function(x, x0, k) {
+    1 / (1 + exp(-k * (x - x0)))
+}
\ No newline at end of file
--- a/man/estBCVParams.Rd
+++ b/man/estBCVParams.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/estimate.R
+\name{estBCVParams}
+\alias{estBCVParams}
+\title{Estimate Biological Coefficient of Variation parameters}
+\usage{
+estBCVParams(counts, params)
+}
+\arguments{
+\item{counts}{counts matrix to estimate parameters from.}
+\item{params}{splatParams object to store estimated values in.}
+}
+\value{
+spaltParams object with estimated values.
+}
+\description{
+Parameters are estimated using the \code{estimateDisp} function in the
+\code{edgeR} package. Specifically the common dispersion and prior degrees
+of freedom. See \code{\link{estimateDisp}} for details.
+}
+\examples{
+data("sc_example_counts")
+params <- params()
+params <- estBCVParams(sc_example_counts, params)
+params
+}
--- a/man/estDropoutParams.Rd
+++ b/man/estDropoutParams.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/estimate.R
+\name{estDropoutParams}
+\alias{estDropoutParams}
+\title{Estimate dropout parameters}
+\usage{
+estDropoutParams(norm.counts, params)
+}
+\arguments{
+\item{norm.counts}{library size normalised counts matrix.}
+\item{params}{splatParams object to store estimated values in.}
+}
+\value{
+Params object with estimated values.
+}
+\description{
+Estimate the midpoint and shape parameters for the logistic function used
+when simulating dropout. Also estimates whether dropout is likely to be
+present in the dataset.
+}
+\details{
+Logistic function parameters are estimated by fitting a logistic function
+to the relationship between log2 mean gene expression and the proportion of
+zeros in each gene. See \code{\link{nls}} for details of fitting. The
+presence of dropout is determined by comparing the observed number of zeros
+in each gene to the expected number of zeros from a negative binomial
+distribution with the gene mean and a dispersion of 0.1. If the maximum
+difference between the observed number of zeros and the expected number is
+greater than 10 percent of the number of cells
+(\code{max(obs.zeros - exp.zeros) > 0.1 * ncol(norm.counts)}) then dropout is
+considered to be present in the dataset. This is a somewhat crude measure
+but should give a reasonable indication. A more accurate approach is to look
+at a plot of log2 mean expression vs the difference between observed and
+expected number of zeros across all genes.
+}
+\examples{
+data("sc_example_counts")
+norm_ex_counts <- t(t(sc_example_counts) / colSums(sc_example_counts) *
+                  median(colSums(sc_example_counts)))
+params <- params()
+params <- estMeanParams(norm_ex_counts, params)
+}
--- a/man/estLibParams.Rd
+++ b/man/estLibParams.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/estimate.R
+\name{estLibParams}
+\alias{estLibParams}
+\title{Estimate library size parameters}
+\usage{
+estLibParams(counts, params)
+}
+\arguments{
+\item{counts}{counts matrix to estimate parameters from.}
+\item{params}{splatParams object to store estimated values in.}
+}
+\value{
+splatParams object with estimated values.
+}
+\description{
+A log-normal distribution is fitted to the library sizes and the estimated
+parameters are added to the params object. See \code{\link{fitdist}} for
+details on the fitting.
+}
+\examples{
+data("sc_example_counts")
+params <- params()
+params <- estLibParams(sc_example_counts, params)
+params
+}
--- a/man/estMeanParams.Rd
+++ b/man/estMeanParams.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/estimate.R
+\name{estMeanParams}
+\alias{estMeanParams}
+\title{Estimate mean parameters}
+\usage{
+estMeanParams(norm.counts, params)
+}
+\arguments{
+\item{norm.counts}{library size normalised counts matrix.}
+\item{params}{splatParams object to store estimated values in.}
+}
+\value{
+splatParams object with estimated values.
+}
+\description{
+Estimate rate and shape parameters for the gamma distribution used to
+simulate gene expression means. Uses a method of moments approach.
+}
+\examples{
+data("sc_example_counts")
+norm_ex_counts <- t(t(sc_example_counts) / colSums(sc_example_counts) *
+                  median(colSums(sc_example_counts)))
+params <- params()
+params <- estMeanParams(norm_ex_counts, params)
+params
+}
--- a/man/estOutlierParams.Rd
+++ b/man/estOutlierParams.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/estimate.R
+\name{estOutlierParams}
+\alias{estOutlierParams}
+\title{Estimate expression outlier parameters}
+\usage{
+estOutlierParams(norm.counts, params)
+}
+\arguments{
+\item{norm.counts}{library size normalised counts matrix.}
+\item{params}{splatParams object to store estimated values in.}
+}
+\value{
+splatParams object with estimated values.
+}
+\description{
+Parameters are estimated by comparing means of individual genes to the
+median mean expression level.
+}
+\details{
+Expression outlier genes are detected using the Median Absolute Deviation
+(MAD) from median method. If the log2 mean expression of a gene is greater
+than two MADs from the median log2 mean expression it is designated as a
+outlier. The proportion of outlier genes is used to estimate the outlier
+probability. The low outlier probability is estimated as the proportion of
+outlier genes that have a log2 mean less than the median log2 mean. Factors
+for each outlier gene are calculated by dividing mean expression by the
+median mean expression. A log-normal distribution is then fitted to these
+factors in order to estimate the outlier factor location and scale
+parameters. See \code{\link{fitdist}} for details on the fitting.
+}
+\examples{
+data("sc_example_counts")
+norm_ex_counts <- t(t(sc_example_counts) / colSums(sc_example_counts) *
+                  median(colSums(sc_example_counts)))
+params <- params()
+params <- estOutlierParams(norm_ex_counts, params)
+params
+}
--- a/man/estimateParams.Rd
+++ b/man/estimateParams.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/estimate.R
+\name{estimateParams}
+\alias{estimateParams}
+\alias{estimateParams.SCESet}
+\alias{estimateParams.matrix}
+\title{Estimate simulation parameters}
+\usage{
+estimateParams(x, params = NULL)
+\method{estimateParams}{SCESet}(sce, params = NULL)
+\method{estimateParams}{matrix}(counts, params = NULL)
+}
+\arguments{
+\item{x}{either a counts matrix or an SCESet object containing count data to
+estimate parameters from.}
+\item{params}{splatParams object to store estimated values in. If not
+provided a new object will be created.}
+}
+\value{
+A params object containing the estimated parameters.
+}
+\description{
+Estimate simulation parameters from a real dataset. See the individual
+estimation functions for more details on how this is done.
+}
+\examples{
+data("sc_example_counts")
+params <- estimateParams(sc_example_counts)
+params
+# Replace defaults with estimated params
+params <- defaultParams()
+params <- estimateParams(sc_example_counts, params)
+params
+}
+\seealso{
+\code{\link{estMeanParams}},  \code{\link{estLibParams}},
+\code{\link{estOutlierParams}}, \code{\link{estBCVParams}},
+\code{\link{estDropoutParams}}
+}
--- a/man/logistic.Rd
+++ b/man/logistic.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{logistic}
+\alias{logistic}
+\title{Logistic function}
+\usage{
+logistic(x, x0, k)
+}
+\arguments{
+\item{x}{value to apply the function to.}
+\item{x0}{midpoint parameter. Gives the centre of the function.}
+\item{k}{shape parameter. Gives the slope of the function.}
+}
+\value{
+RETURN DESCRIPTION
+}
+\description{
+Implementation of the logistic function
+}
+\examples{
+logistic(0, 1, 1)
+}