Newer
Older
#' Estimate Lun2 simulation parameters
#'
#' Estimate simulation parameters for the Lun2 simulation from a real dataset.
#'
#' @param counts either a counts matrix or an SCESet object containing count
#' data to estimate parameters from.
#' @param plates integer vector giving the plate that each cell originated from.
#' @param params Lun2Params object to store estimated values in.
#' @param min.size minimum size of clusters when identifying group of cells in
#' the data.
#' @param verbose logical. Whether to show progress messages.
#' @param BPPARAM A \code{\link[BiocParallel]{BiocParallelParam}} instance
#' giving the parallel back-end to be used. Default is
#' \code{\link[BiocParallel]{SerialParam}} which uses a single core.
#'
#' @details
#' See \code{\link{Lun2Params}} for more details on the parameters.
#'
#' @return LunParams object containing the estimated parameters.
#'
#' @examples
#' \dontrun{
#' data("sc_example_counts")
#' data("sc_example_cell_info")
#' plates <- factor(sc_example_cell_info$Mutation_Status)
#' params <- lun2Estimate(sc_example_counts, plates, min.size = 20)
#' params
#' }
#' @importFrom BiocParallel bplapply SerialParam
#' @export
lun2Estimate <- function(counts, plates, params = newLun2Params(),
min.size = 200, verbose = TRUE,
BPPARAM = SerialParam()) {
UseMethod("lun2Estimate")
}
#' @rdname lun2Estimate
#' @export
lun2Estimate.SCESet <- function(counts, plates, params = newLun2Params(),
min.size = 200, verbose = TRUE,
BPPARAM = SerialParam()) {
counts <- scater::counts(counts)
lun2Estimate(counts, plates, params, min.size = min.size, verbose = verbose)
}
#' @rdname lun2Estimate
#' @importFrom stats model.matrix
#' @importFrom locfit locfit
#' @export
lun2Estimate.matrix <- function(counts, plates, params = newLun2Params(),
min.size = 200, verbose = TRUE,
BPPARAM = SerialParam()) {
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Check suggested packages
if (!requireNamespace("scran", quietly = TRUE)) {
stop("The Lun2 simulation requires the 'scran' package for estimation.")
}
if (!requireNamespace("lme4", quietly = TRUE)) {
stop("The Lun2 simulation requires the 'lme4' package for estimation.")
}
if (!requireNamespace("pscl", quietly = TRUE)) {
stop("The Lun2 simulation requires the 'pscl' package for estimation.")
}
progress <- FALSE
if (requireNamespace("progress", quietly = TRUE)) {
progress <- TRUE
}
progress <- progress && verbose
checkmate::assertClass(params, "Lun2Params")
checkmate::assertInt(min.size, lower = 20, upper = length(plates))
checkmate::assertIntegerish(plates, len = ncol(counts))
if (length(unique(plates)) < 2) {
stop("Plates must contain at least 2 values.")
}
dge <- edgeR::DGEList(counts)
dge <- edgeR::`[.DGEList`(dge, rowMeans(dge$counts) >= 1, )
# Estimate how many groups there are in the data
if (verbose) {message("Estimating number of groups...")}
groups <- scran::quickCluster(dge$counts, min.size)
# Calculate normalisation factors
if (verbose) {message("Computing normalisation factors...")}
# Get the sizes for normalisation based on the number of cells in each
# cluster
min.cluster.size <- min(table(groups))
if (min.cluster.size < 20) {
sizes <- 10
} else if (min.cluster.size < 100) {
sizes <- seq(20, min.size, 20)
} else {
sizes <- seq(20, 100, 20)
}
sum.facs <- scran::computeSumFactors(dge$counts, cluster = groups,
sizes = sizes)
dge$samples$norm.factors <- sum.facs / dge$samples$lib.size
# Mean centre normalisation factors
dge$samples$norm.factors <- dge$samples$norm.factors /
exp(mean(log(dge$samples$norm.factors)))
# Estimating the NB dispersion (assuming sufficient residual d.f. to
# estimate the dispersion without EB shrinkage).
if (verbose) {message("Estimating dispersions...")}
plateX <- model.matrix(~plates)
dge <- edgeR::estimateDisp(dge, plateX, prior.df = 0, trend = "none")
# Estimating the log-overall mean
if (verbose) {message("Estimating gene means...")}
centered.off <- edgeR::getOffset(dge)
centered.off <- centered.off - mean(centered.off)
logmeans <- edgeR::mglmOneGroup(dge$counts, offset = centered.off,
dispersion = dge$tagwise.dispersion)
# Estimating the plate effect variance
if (verbose) {message("Estimating plate effects...")}
if (progress) {
pb.format <- "[:bar] :percent eta: :eta"
pb <- progress::progress_bar$new(format = pb.format,
total = nrow(dge), clear = FALSE)
pb$tick(0)
} else if (verbose) {
message("This may take some time. Install 'progress' to see a ",
"progress bar.")
}
# As well as errors glmer produces warnings. Stop these showing because we
# expect them.
suppressWarnings(
collected <- bplapply(seq_len(nrow(dge)), function(i) {
if (progress) {pb$tick()}
tryCatch({
out <- lme4::glmer(
Counts ~ 0 + (1 | Plate) + offset(log(sum.facs)),
data = data.frame(Counts = as.integer(counts[i, ]),
Group = groups,
Plate = plates),
family = lme4::negative.binomial(1 / dge$tagwise[i]))
output <- unlist(lme4::VarCorr(out))
return(output)
}, error = function(err) {
output <- NA_real_
return(output)
})
sigma2 <- mean(unlist(collected), na.rm = TRUE)
# Repeating the estimation of the dispersion with ZINB models.
if (verbose) {message("Estimating zero-inflated parameters...")}
zinb.prop <- rep(-Inf, nrow(dge))
zinb.disp <- dge$tagwise.dispersion
zinb.mean <- exp(logmeans)
nonzeros <- which(rowSums(dge$counts == 0) > 0)
if (progress) {
pb <- progress::progress_bar$new(format = "[:bar] :percent eta: :eta",
total = length(nonzeros),
clear = FALSE)
pb$tick(0)
} else if (verbose) {
message("This may take some time. Install 'progress' to see a ",
"progress bar.")
}
zinb.ests <- bplapply(nonzeros, function(i) {
if (progress) {pb$tick()}
zinb.est <- c(mean = zinb.mean[i], prop = zinb.prop[i],
disp = zinb.disp[i])
tryCatch({
zfit <- pscl::zeroinfl(dge$count[i, ] ~ 0 + plates | 1,
dist = "negbin", offset = log(sum.facs))
zinb.est <- c(mean = mean(exp(zfit$coefficients$count)),
prop = unname(zfit$coefficients$zero),
disp = 1 / zfit$theta)
}, error = function(err) {})
return(zinb.est)
}, BPPARAM = BPPARAM)
zinb.ests <- do.call("rbind", zinb.ests)
zinb.prop[nonzeros] <- zinb.ests[, "prop"]
zinb.disp[nonzeros] <- zinb.ests[, "disp"]
zinb.mean[nonzeros] <- zinb.ests[, "mean"]
zinb.prop <- exp(zinb.prop) / (1 + exp(zinb.prop))
params <- setParams(params, nGenes = nrow(counts),
cell.plates = plates, plate.var = sigma2,
gene.means = exp(logmeans),
gene.disps = dge$tagwise.dispersion,
gene.ziMeans = zinb.mean, gene.ziDisps = zinb.disp,
gene.ziProps = zinb.prop,
cell.libSizes = dge$samples$lib.size)
return(params)