Skip to content
Snippets Groups Projects
Commit 3065d121 authored by Luke Zappia's avatar Luke Zappia
Browse files

Merge branch 'est'

* est:
  Remove out.loProb from vignette
  Update default Splat parameters
  Remove out.loProb parameter
  Update estimation procedure
parents 45ff5f57 3227bb24
No related branches found
No related tags found
No related merge requests found
...@@ -96,8 +96,6 @@ setClass("SimpleParams", ...@@ -96,8 +96,6 @@ setClass("SimpleParams",
#' \describe{ #' \describe{
#' \item{\code{out.prob}}{Probability that a gene is an expression #' \item{\code{out.prob}}{Probability that a gene is an expression
#' outlier.} #' outlier.}
#' \item{\code{out.loProb}}{Probability that an expression outlier
#' gene is lowly expressed.}
#' \item{\code{out.facLoc}}{Location (meanlog) parameter for the #' \item{\code{out.facLoc}}{Location (meanlog) parameter for the
#' expression outlier factor log-normal distribution.} #' expression outlier factor log-normal distribution.}
#' \item{\code{out.facScale}}{Scale (sdlog) parameter for the #' \item{\code{out.facScale}}{Scale (sdlog) parameter for the
...@@ -182,7 +180,6 @@ setClass("SplatParams", ...@@ -182,7 +180,6 @@ setClass("SplatParams",
lib.loc = "numeric", lib.loc = "numeric",
lib.scale = "numeric", lib.scale = "numeric",
out.prob = "numeric", out.prob = "numeric",
out.loProb = "numeric",
out.facLoc = "numeric", out.facLoc = "numeric",
out.facScale = "numeric", out.facScale = "numeric",
de.prob = "numeric", de.prob = "numeric",
...@@ -202,20 +199,19 @@ setClass("SplatParams", ...@@ -202,20 +199,19 @@ setClass("SplatParams",
prototype = prototype(nGroups = 1, prototype = prototype(nGroups = 1,
groupCells = 100, groupCells = 100,
mean.rate = 0.3, mean.rate = 0.3,
mean.shape = 0.4, mean.shape = 0.6,
lib.loc = 10, lib.loc = 11,
lib.scale = 0.5, lib.scale = 0.2,
out.prob = 0.1, out.prob = 0.05,
out.loProb = 0.5,
out.facLoc = 4, out.facLoc = 4,
out.facScale = 1, out.facScale = 0.5,
de.prob = 0.1, de.prob = 0.1,
de.downProb = 0.5, de.downProb = 0.5,
de.facLoc = 4, de.facLoc = 4,
de.facScale = 1, de.facScale = 1,
bcv.common = 0.1, bcv.common = 0.1,
bcv.df = 25, bcv.df = 60,
dropout.present = TRUE, dropout.present = FALSE,
dropout.mid = 0, dropout.mid = 0,
dropout.shape = -1, dropout.shape = -1,
path.from = 0, path.from = 0,
...@@ -438,4 +434,4 @@ setClass("SCDDParams", ...@@ -438,4 +434,4 @@ setClass("SCDDParams",
sd.range = c(1, 3), sd.range = c(1, 3),
modeFC = c(2, 3, 4), modeFC = c(2, 3, 4),
varInflation = c(1, 1), varInflation = c(1, 1),
condition = "condition")) condition = "condition"))
\ No newline at end of file
...@@ -27,11 +27,6 @@ setValidity("SplatParams", function(object) { ...@@ -27,11 +27,6 @@ setValidity("SplatParams", function(object) {
lib.loc = checkNumber(v$lib.loc), lib.loc = checkNumber(v$lib.loc),
lib.scale = checkNumber(v$lib.scale, lower = 0), lib.scale = checkNumber(v$lib.scale, lower = 0),
out.prob = checkNumber(v$out.prob, lower = 0, upper = 1), out.prob = checkNumber(v$out.prob, lower = 0, upper = 1),
out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1),
out.facLoc = checkNumber(v$lib.loc),
out.facScale = checkNumber(v$lib.scale, lower = 0),
out.prob = checkNumber(v$out.prob, lower = 0, upper = 1),
out.loProb = checkNumber(v$out.loProb, lower = 0, upper = 1),
out.facLoc = checkNumber(v$out.facLoc), out.facLoc = checkNumber(v$out.facLoc),
out.facScale = checkNumber(v$out.facScale, lower = 0), out.facScale = checkNumber(v$out.facScale, lower = 0),
de.prob = checkNumeric(v$de.prob, lower = 0, upper = 1, de.prob = checkNumeric(v$de.prob, lower = 0, upper = 1,
...@@ -108,7 +103,6 @@ setMethod("show", "SplatParams", function(object) { ...@@ -108,7 +103,6 @@ setMethod("show", "SplatParams", function(object) {
"Library size:" = c("(Location)" = "lib.loc", "Library size:" = c("(Location)" = "lib.loc",
"(Scale)" = "lib.scale"), "(Scale)" = "lib.scale"),
"Exprs outliers:" = c("(Probability)" = "out.prob", "Exprs outliers:" = c("(Probability)" = "out.prob",
"(Lo Prob)" = "out.loProb",
"(Location)" = "out.facLoc", "(Location)" = "out.facLoc",
"(Scale)" = "out.facScale"), "(Scale)" = "out.facScale"),
"Diff expr:" = c("[Probability]" = "de.prob", "Diff expr:" = c("[Probability]" = "de.prob",
......
...@@ -59,19 +59,34 @@ splatEstimate.matrix <- function(counts, params = newSplatParams()) { ...@@ -59,19 +59,34 @@ splatEstimate.matrix <- function(counts, params = newSplatParams()) {
#' Estimate Splat mean parameters #' Estimate Splat mean parameters
#' #'
#' Estimate rate and shape parameters for the gamma distribution used to #' Estimate rate and shape parameters for the gamma distribution used to
#' simulate gene expression means using the 'moment matching estimation' method #' simulate gene expression means.
#' of \code{\link[fitdistrplus]{fitdist}}.
#' #'
#' @param norm.counts library size normalised counts matrix. #' @param norm.counts library size normalised counts matrix.
#' @param params SplatParams object to store estimated values in. #' @param params SplatParams object to store estimated values in.
#' #'
#' @details
#' Parameter for the gamma distribution are estimated by fitting the mean
#' normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum
#' goodness-of-fit estimation' method is used to minimise the Cramer-von Mises
#' distance. This can fail in some situations, in which case the 'method of
#' moments estimation' method is used instead. Prior to fitting the means are
#' winsorized by setting the top and bottom 10 percent of values to the 10th
#' and 90th percentiles.
#'
#' @return SplatParams object with estimated values. #' @return SplatParams object with estimated values.
splatEstMean <- function(norm.counts, params) { splatEstMean <- function(norm.counts, params) {
means <- rowMeans(norm.counts) means <- rowMeans(norm.counts)
means <- means[means != 0] means <- means[means != 0]
fit <- fitdistrplus::fitdist(means, "gamma", method = "mme") means <- winsorize(means, q = 0.1)
fit <- try(fitdistrplus::fitdist(means, "gamma", method = "mge",
gof = "CvM"))
if (class(fit) == "try-error") {
warning("Goodness of fit failed, using Method of Moments")
fit <- fitdistrplus::fitdist(means, "gamma", method = "mme")
}
params <- setParams(params, mean.shape = unname(fit$estimate["shape"]), params <- setParams(params, mean.shape = unname(fit$estimate["shape"]),
mean.rate = unname(fit$estimate["rate"])) mean.rate = unname(fit$estimate["rate"]))
...@@ -111,15 +126,12 @@ splatEstLib <- function(counts, params) { ...@@ -111,15 +126,12 @@ splatEstLib <- function(counts, params) {
#' @details #' @details
#' Expression outlier genes are detected using the Median Absolute Deviation #' Expression outlier genes are detected using the Median Absolute Deviation
#' (MAD) from median method. If the log2 mean expression of a gene is greater #' (MAD) from median method. If the log2 mean expression of a gene is greater
#' than two MADs from the median log2 mean expression it is designated as a #' than two MADs above the median log2 mean expression it is designated as an
#' outlier. The proportion of outlier genes is used to estimate the outlier #' outlier. The proportion of outlier genes is used to estimate the outlier
#' probability. The low outlier probability is estimated as the proportion of #' probability. Factors for each outlier gene are calculated by dividing mean
#' outlier genes that have a log2 mean less than the median log2 mean. Factors #' expression by the median mean expression. A log-normal distribution is then
#' for each outlier gene are calculated by dividing mean expression by the #' fitted to these factors in order to estimate the outlier factor location and
#' median mean expression. A log-normal distribution is then fitted to these #' scale parameters using \code{\link[fitdistrplus]{fitdist}}.
#' factors in order to estimate the outlier factor location and scale
#' parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the
#' fitting.
#' #'
#' @return SplatParams object with estimated values. #' @return SplatParams object with estimated values.
splatEstOutlier <- function(norm.counts, params) { splatEstOutlier <- function(norm.counts, params) {
...@@ -130,34 +142,42 @@ splatEstOutlier <- function(norm.counts, params) { ...@@ -130,34 +142,42 @@ splatEstOutlier <- function(norm.counts, params) {
med <- median(lmeans) med <- median(lmeans)
mad <- mad(lmeans) mad <- mad(lmeans)
lo.bound <- med - 2 * mad bound <- med + 2 * mad
hi.bound <- med + 2 * mad
outs <- which(lmeans > bound)
lo.outs <- which(lmeans < lo.bound) prob <- length(outs) / nrow(norm.counts)
hi.outs <- which(lmeans > hi.bound)
prob <- (length(lo.outs) + length(hi.outs)) / nrow(norm.counts) params <- setParams(params, out.prob = prob)
lo.prob <- length(lo.outs) / (length(lo.outs) + length(hi.outs))
facs <- means[c(lo.outs, hi.outs)] / median(means) if (length(outs) > 1) {
fit <- fitdistrplus::fitdist(facs, "lnorm") facs <- means[outs] / median(means)
fit <- fitdistrplus::fitdist(facs, "lnorm")
params <- setParams(params, out.prob = prob, out.loProb = lo.prob, params <- setParams(params,
out.facLoc = unname(fit$estimate["meanlog"]), out.facLoc = unname(fit$estimate["meanlog"]),
out.facScale = unname(fit$estimate["sdlog"])) out.facScale = unname(fit$estimate["sdlog"]))
}
return(params) return(params)
} }
#' Estimate Splat Biological Coefficient of Variation parameters #' Estimate Splat Biological Coefficient of Variation parameters
#' #'
#' Parameters are estimated using the \code{estimateDisp} function in the #' Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function
#' \code{edgeR} package. Specifically the common dispersion and prior degrees #' in the \code{edgeR} package.
#' of freedom. See \code{\link{estimateDisp}} for details.
#' #'
#' @param counts counts matrix to estimate parameters from. #' @param counts counts matrix to estimate parameters from.
#' @param params SplatParams object to store estimated values in. #' @param params SplatParams object to store estimated values in.
#' #'
#' @details
#' The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common
#' dispersion and prior degrees of freedom. See
#' \code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on
#' simulated data we found a broadly linear relationship between the true
#' underlying common dispersion and the \code{edgR} estimate, therefore we
#' apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}.
#'
#' @return SplatParams object with estimated values. #' @return SplatParams object with estimated values.
splatEstBCV <- function(counts, params) { splatEstBCV <- function(counts, params) {
...@@ -165,7 +185,8 @@ splatEstBCV <- function(counts, params) { ...@@ -165,7 +185,8 @@ splatEstBCV <- function(counts, params) {
design <- matrix(1, ncol(counts), 1) design <- matrix(1, ncol(counts), 1)
disps <- edgeR::estimateDisp(counts, design = design) disps <- edgeR::estimateDisp(counts, design = design)
params <- setParams(params, bcv.common = disps$common.dispersion, params <- setParams(params,
bcv.common = 0.1 + 0.25 * disps$common.dispersion,
bcv.df = disps$prior.df) bcv.df = disps$prior.df)
return(params) return(params)
...@@ -224,4 +245,4 @@ splatEstDropout <- function(norm.counts, params) { ...@@ -224,4 +245,4 @@ splatEstDropout <- function(norm.counts, params) {
dropout.shape = shape) dropout.shape = shape)
return(params) return(params)
} }
\ No newline at end of file
...@@ -280,7 +280,6 @@ splatSimGeneMeans <- function(sim, params) { ...@@ -280,7 +280,6 @@ splatSimGeneMeans <- function(sim, params) {
mean.shape <- getParam(params, "mean.shape") mean.shape <- getParam(params, "mean.shape")
mean.rate <- getParam(params, "mean.rate") mean.rate <- getParam(params, "mean.rate")
out.prob <- getParam(params, "out.prob") out.prob <- getParam(params, "out.prob")
out.loProb <- getParam(params, "out.loProb")
out.facLoc <- getParam(params, "out.facLoc") out.facLoc <- getParam(params, "out.facLoc")
out.facScale <- getParam(params, "out.facScale") out.facScale <- getParam(params, "out.facScale")
...@@ -288,7 +287,7 @@ splatSimGeneMeans <- function(sim, params) { ...@@ -288,7 +287,7 @@ splatSimGeneMeans <- function(sim, params) {
base.means.gene <- rgamma(nGenes, shape = mean.shape, rate = mean.rate) base.means.gene <- rgamma(nGenes, shape = mean.shape, rate = mean.rate)
# Add expression outliers # Add expression outliers
outlier.facs <- getLNormFactors(nGenes, out.prob, out.loProb, out.facLoc, outlier.facs <- getLNormFactors(nGenes, out.prob, 0, out.facLoc,
out.facScale) out.facScale)
median.means.gene <- median(base.means.gene) median.means.gene <- median(base.means.gene)
outlier.means <- median.means.gene * outlier.facs outlier.means <- median.means.gene * outlier.facs
......
...@@ -26,4 +26,27 @@ rbindMatched <- function(df1, df2) { ...@@ -26,4 +26,27 @@ rbindMatched <- function(df1, df2) {
combined <- rbind(df1[, common.names], df2[, common.names]) combined <- rbind(df1[, common.names], df2[, common.names])
return(combined) return(combined)
} }
\ No newline at end of file
#' Winsorize vector
#'
#' Set outliers in a numeric vector to a specified percentile.
#'
#' @param x Numeric vector to winsorize
#' @param q Percentile to set from each end
#'
#' @return Winsorized numeric vector
winsorize <- function(x, q) {
checkmate::check_numeric(x, any.missing = FALSE)
checkmate::check_number(q, lower = 0, upper = 1)
lohi <- stats::quantile(x, c(q, 1 - q), na.rm = TRUE)
if (diff(lohi) < 0) { lohi <- rev(lohi) }
x[!is.na(x) & x < lohi[1]] <- lohi[1]
x[!is.na(x) & x > lohi[2]] <- lohi[2]
return(x)
}
...@@ -40,8 +40,6 @@ The Splatter simulation requires the following parameters: ...@@ -40,8 +40,6 @@ The Splatter simulation requires the following parameters:
\describe{ \describe{
\item{\code{out.prob}}{Probability that a gene is an expression \item{\code{out.prob}}{Probability that a gene is an expression
outlier.} outlier.}
\item{\code{out.loProb}}{Probability that an expression outlier
gene is lowly expressed.}
\item{\code{out.facLoc}}{Location (meanlog) parameter for the \item{\code{out.facLoc}}{Location (meanlog) parameter for the
expression outlier factor log-normal distribution.} expression outlier factor log-normal distribution.}
\item{\code{out.facScale}}{Scale (sdlog) parameter for the \item{\code{out.facScale}}{Scale (sdlog) parameter for the
......
...@@ -15,7 +15,14 @@ splatEstBCV(counts, params) ...@@ -15,7 +15,14 @@ splatEstBCV(counts, params)
SplatParams object with estimated values. SplatParams object with estimated values.
} }
\description{ \description{
Parameters are estimated using the \code{estimateDisp} function in the Parameters are estimated using the \code{\link[edgeR]{estimateDisp}} function
\code{edgeR} package. Specifically the common dispersion and prior degrees in the \code{edgeR} package.
of freedom. See \code{\link{estimateDisp}} for details. }
\details{
The \code{\link[edgeR]{estimateDisp}} function is used to estimate the common
dispersion and prior degrees of freedom. See
\code{\link[edgeR]{estimateDisp}} for details. When estimating parameters on
simulated data we found a broadly linear relationship between the true
underlying common dispersion and the \code{edgR} estimate, therefore we
apply a small correction, \code{disp = 0.1 + 0.25 * edgeR.disp}.
} }
...@@ -16,6 +16,14 @@ SplatParams object with estimated values. ...@@ -16,6 +16,14 @@ SplatParams object with estimated values.
} }
\description{ \description{
Estimate rate and shape parameters for the gamma distribution used to Estimate rate and shape parameters for the gamma distribution used to
simulate gene expression means using the 'moment matching estimation' method simulate gene expression means.
of \code{\link[fitdistrplus]{fitdist}}. }
\details{
Parameter for the gamma distribution are estimated by fitting the mean
normalised counts using \code{\link[fitdistrplus]{fitdist}}. The 'maximum
goodness-of-fit estimation' method is used to minimise the Cramer-von Mises
distance. This can fail in some situations, in which case the 'method of
moments estimation' method is used instead. Prior to fitting the means are
winsorized by setting the top and bottom 10 percent of values to the 10th
and 90th percentiles.
} }
...@@ -21,13 +21,10 @@ median mean expression level. ...@@ -21,13 +21,10 @@ median mean expression level.
\details{ \details{
Expression outlier genes are detected using the Median Absolute Deviation Expression outlier genes are detected using the Median Absolute Deviation
(MAD) from median method. If the log2 mean expression of a gene is greater (MAD) from median method. If the log2 mean expression of a gene is greater
than two MADs from the median log2 mean expression it is designated as a than two MADs above the median log2 mean expression it is designated as an
outlier. The proportion of outlier genes is used to estimate the outlier outlier. The proportion of outlier genes is used to estimate the outlier
probability. The low outlier probability is estimated as the proportion of probability. Factors for each outlier gene are calculated by dividing mean
outlier genes that have a log2 mean less than the median log2 mean. Factors expression by the median mean expression. A log-normal distribution is then
for each outlier gene are calculated by dividing mean expression by the fitted to these factors in order to estimate the outlier factor location and
median mean expression. A log-normal distribution is then fitted to these scale parameters using \code{\link[fitdistrplus]{fitdist}}.
factors in order to estimate the outlier factor location and scale
parameters. See \code{\link[fitdistrplus]{fitdist}} for details on the
fitting.
} }
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{winsorize}
\alias{winsorize}
\title{Winsorize vector}
\usage{
winsorize(x, q)
}
\arguments{
\item{x}{Numeric vector to winsorize}
\item{q}{Percentile to set from each end}
}
\value{
Winsorized numeric vector
}
\description{
Set outliers in a numeric vector to a specified percentile.
}
...@@ -102,8 +102,6 @@ The parameters required for the Splat simulation are briefly described here: ...@@ -102,8 +102,6 @@ The parameters required for the Splat simulation are briefly described here:
distribution. distribution.
* **Expression outlier parameters** * **Expression outlier parameters**
* `out.prob` - Probability that a gene is an expression outlier. * `out.prob` - Probability that a gene is an expression outlier.
* `out.loProb` - Probability that an expression outlier gene is lowly
expressed (other outliers are highly expressed).
* `out.facLoc` - Location (meanlog) parameter for the expression outlier * `out.facLoc` - Location (meanlog) parameter for the expression outlier
factor log-normal distribution. factor log-normal distribution.
* `out.facScale` - Scale (sdlog) parameter for the expression outlier factor * `out.facScale` - Scale (sdlog) parameter for the expression outlier factor
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment