diff --git a/R/AllClasses.R b/R/AllClasses.R index c8066bd05d9353b0106d3e5521c3fa58afcac6c5..568b391450cd5124b83af544bf93130f2c20ea1c 100644 --- a/R/AllClasses.R +++ b/R/AllClasses.R @@ -96,8 +96,6 @@ setClass("SimpleParams", #' \describe{ #' \item{\code{out.prob}}{Probability that a gene is an expression #' outlier.} -#' \item{\code{out.loProb}}{Probability that an expression outlier -#' gene is lowly expressed.} #' \item{\code{out.facLoc}}{Location (meanlog) parameter for the #' expression outlier factor log-normal distribution.} #' \item{\code{out.facScale}}{Scale (sdlog) parameter for the @@ -182,7 +180,6 @@ setClass("SplatParams", lib.loc = "numeric", lib.scale = "numeric", out.prob = "numeric", - out.loProb = "numeric", out.facLoc = "numeric", out.facScale = "numeric", de.prob = "numeric", @@ -202,20 +199,19 @@ setClass("SplatParams", prototype = prototype(nGroups = 1, groupCells = 100, mean.rate = 0.3, - mean.shape = 0.4, - lib.loc = 10, - lib.scale = 0.5, - out.prob = 0.1, - out.loProb = 0.5, + mean.shape = 0.6, + lib.loc = 11, + lib.scale = 0.2, + out.prob = 0.05, out.facLoc = 4, - out.facScale = 1, + out.facScale = 0.5, de.prob = 0.1, de.downProb = 0.5, de.facLoc = 4, de.facScale = 1, bcv.common = 0.1, - bcv.df = 25, - dropout.present = TRUE, + bcv.df = 60, + dropout.present = FALSE, dropout.mid = 0, dropout.shape = -1, path.from = 0, @@ -438,4 +434,4 @@ setClass("SCDDParams", sd.range = c(1, 3), modeFC = c(2, 3, 4), varInflation = c(1, 1), - condition = "condition")) \ No newline at end of file + condition = "condition")) diff --git a/R/SplatParams-methods.R b/R/SplatParams-methods.R index e3f773d6c26622dfd475f57110eb50c608019b43..07f9916d2208b24a478d3d3155380b7794a5186e 100644 --- a/R/SplatParams-methods.R +++ b/R/SplatParams-methods.R @@ -27,11 +27,6 @@ setValidity("SplatParams", function(object) { lib.loc = checkNumber(v$lib.loc), lib.scale = checkNumber(v$lib.scale, lower = 0), out.prob = checkNumber(v$out.prob, lower = 0, upper = 1), - out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1), - out.facLoc = checkNumber(v$lib.loc), - out.facScale = checkNumber(v$lib.scale, lower = 0), - out.prob = checkNumber(v$out.prob, lower = 0, upper = 1), - out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1), out.facLoc = checkNumber(v$out.facLoc), out.facScale = checkNumber(v$out.facScale, lower = 0), de.prob = checkNumeric(v$de.prob, lower = 0, upper = 1, @@ -108,7 +103,6 @@ setMethod("show", "SplatParams", function(object) { "Library size:" = c("(Location)" = "lib.loc", "(Scale)" = "lib.scale"), "Exprs outliers:" = c("(Probability)" = "out.prob", - "(Lo Prob)" = "out.loProb", "(Location)" = "out.facLoc", "(Scale)" = "out.facScale"), "Diff expr:" = c("[Probability]" = "de.prob", diff --git a/R/splat-estimate.R b/R/splat-estimate.R index 69ddec5e3b500c46fc8a8b75d698db7135c7d60b..d7a070e660e868e8f5e8001b7b2704a739797dc9 100644 --- a/R/splat-estimate.R +++ b/R/splat-estimate.R @@ -59,19 +59,34 @@ splatEstimate.matrix <- function(counts, params = newSplatParams()) { #' Estimate Splat mean parameters #' #' Estimate rate and shape parameters for the gamma distribution used to -#' simulate gene expression means using the 'moment matching estimation' method -#' of \code{\link[fitdistrplus]{fitdist}}. +#' simulate gene expression means. #' #' @param norm.counts library size normalised counts matrix. #' @param params SplatParams object to store estimated values in. #' +#' @details +#' Parameter for the gamma distribution are estimated by fitting the mean +#' normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum +#' goodness-of-fit estimation' method is used to minimise the Cramer-von Mises +#' distance. This can fail in some situations, in which case the 'method of +#' moments estimation' method is used instead. Prior to fitting the means are +#' winsorized by setting the top and bottom 10 percent of values to the 10th +#' and 90th percentiles. +#' #' @return SplatParams object with estimated values. splatEstMean <- function(norm.counts, params) { means <- rowMeans(norm.counts) means <- means[means != 0] - fit <- fitdistrplus::fitdist(means, "gamma", method = "mme") + means <- winsorize(means, q = 0.1) + + fit <- try(fitdistrplus::fitdist(means, "gamma", method = "mge", + gof = "CvM")) + if (class(fit) == "try-error") { + warning("Goodness of fit failed, using Method of Moments") + fit <- fitdistrplus::fitdist(means, "gamma", method = "mme") + } params <- setParams(params, mean.shape = unname(fit$estimate["shape"]), mean.rate = unname(fit$estimate["rate"])) @@ -111,15 +126,12 @@ splatEstLib <- function(counts, params) { #' @details #' Expression outlier genes are detected using the Median Absolute Deviation #' (MAD) from median method. If the log2 mean expression of a gene is greater -#' than two MADs from the median log2 mean expression it is designated as a +#' than two MADs above the median log2 mean expression it is designated as an #' outlier. The proportion of outlier genes is used to estimate the outlier -#' probability. The low outlier probability is estimated as the proportion of -#' outlier genes that have a log2 mean less than the median log2 mean. Factors -#' for each outlier gene are calculated by dividing mean expression by the -#' median mean expression. A log-normal distribution is then fitted to these -#' factors in order to estimate the outlier factor location and scale -#' parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the -#' fitting. +#' probability. Factors for each outlier gene are calculated by dividing mean +#' expression by the median mean expression. A log-normal distribution is then +#' fitted to these factors in order to estimate the outlier factor location and +#' scale parameters using \code{\link[fitdistrplus]{fitdist}}. #' #' @return SplatParams object with estimated values. splatEstOutlier <- function(norm.counts, params) { @@ -130,34 +142,42 @@ splatEstOutlier <- function(norm.counts, params) { med <- median(lmeans) mad <- mad(lmeans) - lo.bound <- med - 2 * mad - hi.bound <- med + 2 * mad + bound <- med + 2 * mad + + outs <- which(lmeans > bound) - lo.outs <- which(lmeans < lo.bound) - hi.outs <- which(lmeans > hi.bound) + prob <- length(outs) / nrow(norm.counts) - prob <- (length(lo.outs) + length(hi.outs)) / nrow(norm.counts) - lo.prob <- length(lo.outs) / (length(lo.outs) + length(hi.outs)) + params <- setParams(params, out.prob = prob) - facs <- means[c(lo.outs, hi.outs)] / median(means) - fit <- fitdistrplus::fitdist(facs, "lnorm") + if (length(outs) > 1) { + facs <- means[outs] / median(means) + fit <- fitdistrplus::fitdist(facs, "lnorm") - params <- setParams(params, out.prob = prob, out.loProb = lo.prob, - out.facLoc = unname(fit$estimate["meanlog"]), - out.facScale = unname(fit$estimate["sdlog"])) + params <- setParams(params, + out.facLoc = unname(fit$estimate["meanlog"]), + out.facScale = unname(fit$estimate["sdlog"])) + } return(params) } #' Estimate Splat Biological Coefficient of Variation parameters #' -#' Parameters are estimated using the \code{estimateDisp} function in the -#' \code{edgeR} package. Specifically the common dispersion and prior degrees -#' of freedom. See \code{\link{estimateDisp}} for details. +#' Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function +#' in the \code{edgeR} package. #' #' @param counts counts matrix to estimate parameters from. #' @param params SplatParams object to store estimated values in. #' +#' @details +#' The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common +#' dispersion and prior degrees of freedom. See +#' \code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on +#' simulated data we found a broadly linear relationship between the true +#' underlying common dispersion and the \code{edgR} estimate, therefore we +#' apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}. +#' #' @return SplatParams object with estimated values. splatEstBCV <- function(counts, params) { @@ -165,7 +185,8 @@ splatEstBCV <- function(counts, params) { design <- matrix(1, ncol(counts), 1) disps <- edgeR::estimateDisp(counts, design = design) - params <- setParams(params, bcv.common = disps$common.dispersion, + params <- setParams(params, + bcv.common = 0.1 + 0.25 * disps$common.dispersion, bcv.df = disps$prior.df) return(params) @@ -224,4 +245,4 @@ splatEstDropout <- function(norm.counts, params) { dropout.shape = shape) return(params) -} \ No newline at end of file +} diff --git a/R/splat-simulate.R b/R/splat-simulate.R index 4e3efb0df119ceaf49aa566a37d355e3fdf32c74..bb54cb6f870cfa097a5761a04ad382d681c960a4 100644 --- a/R/splat-simulate.R +++ b/R/splat-simulate.R @@ -280,7 +280,6 @@ splatSimGeneMeans <- function(sim, params) { mean.shape <- getParam(params, "mean.shape") mean.rate <- getParam(params, "mean.rate") out.prob <- getParam(params, "out.prob") - out.loProb <- getParam(params, "out.loProb") out.facLoc <- getParam(params, "out.facLoc") out.facScale <- getParam(params, "out.facScale") @@ -288,7 +287,7 @@ splatSimGeneMeans <- function(sim, params) { base.means.gene <- rgamma(nGenes, shape = mean.shape, rate = mean.rate) # Add expression outliers - outlier.facs <- getLNormFactors(nGenes, out.prob, out.loProb, out.facLoc, + outlier.facs <- getLNormFactors(nGenes, out.prob, 0, out.facLoc, out.facScale) median.means.gene <- median(base.means.gene) outlier.means <- median.means.gene * outlier.facs diff --git a/R/utils.R b/R/utils.R index 73bb02308363de31fd84e839968846a77d29bca7..39e770d8932c8bb185df020cbd45fc54bdf5a47a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -26,4 +26,27 @@ rbindMatched <- function(df1, df2) { combined <- rbind(df1[, common.names], df2[, common.names]) return(combined) -} \ No newline at end of file +} + +#' Winsorize vector +#' +#' Set outliers in a numeric vector to a specified percentile. +#' +#' @param x Numeric vector to winsorize +#' @param q Percentile to set from each end +#' +#' @return Winsorized numeric vector +winsorize <- function(x, q) { + + checkmate::check_numeric(x, any.missing = FALSE) + checkmate::check_number(q, lower = 0, upper = 1) + + lohi <- stats::quantile(x, c(q, 1 - q), na.rm = TRUE) + + if (diff(lohi) < 0) { lohi <- rev(lohi) } + + x[!is.na(x) & x < lohi[1]] <- lohi[1] + x[!is.na(x) & x > lohi[2]] <- lohi[2] + + return(x) +} diff --git a/man/SplatParams.Rd b/man/SplatParams.Rd index 473507c2e3edc05f9886a1a2af66369e59c2fe69..14bca2a036705ada0b92b2e5084609ed98075a48 100644 --- a/man/SplatParams.Rd +++ b/man/SplatParams.Rd @@ -40,8 +40,6 @@ The Splatter simulation requires the following parameters: \describe{ \item{\code{out.prob}}{Probability that a gene is an expression outlier.} - \item{\code{out.loProb}}{Probability that an expression outlier - gene is lowly expressed.} \item{\code{out.facLoc}}{Location (meanlog) parameter for the expression outlier factor log-normal distribution.} \item{\code{out.facScale}}{Scale (sdlog) parameter for the diff --git a/man/splatEstBCV.Rd b/man/splatEstBCV.Rd index 9b66b1706a9ec2e31aaae4515c704c1a34bc668d..5f81642f3d4f4252c865fd31713520bc3e6fca22 100644 --- a/man/splatEstBCV.Rd +++ b/man/splatEstBCV.Rd @@ -15,7 +15,14 @@ splatEstBCV(counts, params) SplatParams object with estimated values. } \description{ -Parameters are estimated using the \code{estimateDisp} function in the -\code{edgeR} package. Specifically the common dispersion and prior degrees -of freedom. See \code{\link{estimateDisp}} for details. +Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function +in the \code{edgeR} package. +} +\details{ +The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common +dispersion and prior degrees of freedom. See +\code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on +simulated data we found a broadly linear relationship between the true +underlying common dispersion and the \code{edgR} estimate, therefore we +apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}. } diff --git a/man/splatEstMean.Rd b/man/splatEstMean.Rd index e0892f9308585edab292d8eac1d38dbb1d5be812..c019f6782a4bb61e4ca5605b09d65ad9725a07e9 100644 --- a/man/splatEstMean.Rd +++ b/man/splatEstMean.Rd @@ -16,6 +16,14 @@ SplatParams object with estimated values. } \description{ Estimate rate and shape parameters for the gamma distribution used to -simulate gene expression means using the 'moment matching estimation' method -of \code{\link[fitdistrplus]{fitdist}}. +simulate gene expression means. +} +\details{ +Parameter for the gamma distribution are estimated by fitting the mean +normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum +goodness-of-fit estimation' method is used to minimise the Cramer-von Mises +distance. This can fail in some situations, in which case the 'method of +moments estimation' method is used instead. Prior to fitting the means are +winsorized by setting the top and bottom 10 percent of values to the 10th +and 90th percentiles. } diff --git a/man/splatEstOutlier.Rd b/man/splatEstOutlier.Rd index 81e56853061557d9b4618004bd8a570a53eb0575..543b6e6337c095b81af6053fa6374d4c89172fa4 100644 --- a/man/splatEstOutlier.Rd +++ b/man/splatEstOutlier.Rd @@ -21,13 +21,10 @@ median mean expression level. \details{ Expression outlier genes are detected using the Median Absolute Deviation (MAD) from median method. If the log2 mean expression of a gene is greater -than two MADs from the median log2 mean expression it is designated as a +than two MADs above the median log2 mean expression it is designated as an outlier. The proportion of outlier genes is used to estimate the outlier -probability. The low outlier probability is estimated as the proportion of -outlier genes that have a log2 mean less than the median log2 mean. Factors -for each outlier gene are calculated by dividing mean expression by the -median mean expression. A log-normal distribution is then fitted to these -factors in order to estimate the outlier factor location and scale -parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the -fitting. +probability. Factors for each outlier gene are calculated by dividing mean +expression by the median mean expression. A log-normal distribution is then +fitted to these factors in order to estimate the outlier factor location and +scale parameters using \code{\link[fitdistrplus]{fitdist}}. } diff --git a/man/winsorize.Rd b/man/winsorize.Rd new file mode 100644 index 0000000000000000000000000000000000000000..ddef66fdb41df7ccc804cf0b4ab160de25434104 --- /dev/null +++ b/man/winsorize.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{winsorize} +\alias{winsorize} +\title{Winsorize vector} +\usage{ +winsorize(x, q) +} +\arguments{ +\item{x}{Numeric vector to winsorize} + +\item{q}{Percentile to set from each end} +} +\value{ +Winsorized numeric vector +} +\description{ +Set outliers in a numeric vector to a specified percentile. +} diff --git a/vignettes/splatter.Rmd b/vignettes/splatter.Rmd index 8c1a8152b67529fda8f91979425c21a05df4cb57..03eb98f35c1ec9b7494f15f0be590b1a49f53977 100644 --- a/vignettes/splatter.Rmd +++ b/vignettes/splatter.Rmd @@ -102,8 +102,6 @@ The parameters required for the Splat simulation are briefly described here: distribution. * **Expression outlier parameters** * `out.prob` - Probability that a gene is an expression outlier. - * `out.loProb` - Probability that an expression outlier gene is lowly - expressed (other outliers are highly expressed). * `out.facLoc` - Location (meanlog) parameter for the expression outlier factor log-normal distribution. * `out.facScale` - Scale (sdlog) parameter for the expression outlier factor