Newer
Older
#' Lun2 simulation
#'
#' Simulate single-cell RNA-seq count data using the method described in Lun
#' and Marioni "Overcoming confounding plate effects in differential expression
#' analyses of single-cell RNA-seq data".
#'
#' @param params Lun2Params object containing simulation parameters.
#' @param zinb logical. Whether to use a zero-inflated model.
#' @param verbose logical. Whether to print progress messages
#' @param ... any additional parameter settings to override what is provided in
#' \code{params}.
#'
#' @details
#' The Lun2 simulation uses a negative-binomial distribution where the means and
#' dispersions have been sampled from a real dataset
#' (using \code{\link{lun2Estimate}}). The other core feature of the Lun2
#' simulation is the addition of plate effects. Differential expression can be
#' added between two groups of plates (an "ingroup" and all other plates).
#' Library size factors are also applied and optionally a zero-inflated
#' negative-binomial can be used.
#' If the number of genes to simulate differs from the number of provied gene
#' parameters or the number of cells to simulate differs from the number of
#' library sizes the relevant paramters will be sampled with a warning. This
#' allows any number of genes or cells to be simulated regardless of the
#' number in the dataset used in the estimation step but has the downside that
#' some genes or cells may be simulated multiple times.
#'
#'
#' @references
#' Lun ATL, Marioni JC. Overcoming confounding plate effects in differential
#' expression analyses of single-cell RNA-seq data. bioRxiv (2016).
#'
#' Paper: \url{dx.doi.org/10.1101/073973}
#'
#' Code: \url{https://github.com/MarioniLab/PlateEffects2016}
#'
#' @examples
#' sim <- lun2Simulate()
#' @export
#' @importFrom methods new
#' @importFrom scater newSCESet set_exprs<-
lun2Simulate <- function(params = newLun2Params(), zinb = FALSE,
verbose = TRUE, ...) {
checkmate::assertClass(params, "Lun2Params")
params <- setParams(params, ...)
# Set random seed
seed <- getParam(params, "seed")
set.seed(seed)
if (verbose) {message("Getting parameters...")}
nGenes <- getParam(params, "nGenes")
nCells <- getParam(params, "nCells")
nPlates <- getParam(params, "nPlates")
cell.plates <- getParam(params, "cell.plates")
plate.var <- getParam(params, "plate.var") / getParam(params, "plate.mod")
lib.sizes <- getParam(params, "cell.libSizes")
lib.mod <- getParam(params, "cell.libMod")
# Sample lib.sizes if necessary
if (length(lib.sizes) != nCells) {
warning("Number of lib.sizes not equal to nCells. ",
"lib.sizes will be sampled.")
selected <- sample(length(lib.sizes), nCells, replace = TRUE)
libSizes <- lib.sizes[selected]
}
if (zinb) {
gene.params <- getParam(params, "zi.params")
gene.params <- getParam(params, "gene.params")
}
# Sample gene parameters if necessary
if (nrow(gene.params) != nGenes) {
warning("Number of gene parameters does not equal nGenes. ",
"Gene parameters will be sampled.")
selected <- sample(nrow(gene.params), nGenes, replace = TRUE)
gene.params <- gene.params[selected, ]
gene.means <- gene.params$Mean
gene.disps <- gene.params$Disp
if (zinb) {
gene.ziProps <- gene.params$Prop
}
#if (name == "nGenes") {
# old.nGenes <- getParam(object, "nGenes")
# if (value != old.nGenes) {
# warning("nGenes has been changed. Gene parameter vectors will be ",
# "sampled to length new nGenes.")
# selected <- sample(seq_len(old.nGenes), size = value,
# replace = TRUE)
# for (parameter in grep("gene", slotNames(object), value = TRUE)) {
# old.value <- getParam(object, parameter)
# object <- setParamUnchecked(object, parameter,
# old.value[selected])
# }
# }
#}
cell.names <- paste0("Cell", seq_len(nCells))
gene.names <- paste0("Gene", seq_len(nGenes))
features <- new("AnnotatedDataFrame",
data = data.frame(Gene = gene.names, GeneMean = gene.means,
GeneDisp = gene.disps))
phenos <- new("AnnotatedDataFrame",
data = data.frame(Cell = cell.names, Plate = cell.plates))
if (zinb) {
features$GeneZeroProp <- gene.ziProps
}
if (verbose) {message("Simulating plate means...")}
plate.facs <- matrix(exp(rnorm(nGenes * nPlates, mean = -plate.var / 2,
sd = sqrt(plate.var))),
nrow = nGenes, ncol = nPlates)
base.plate.means <- gene.means * plate.facs
if (de.nGenes > 0) {
title <- "BaseGeneMeanPlate"
} else {
title <- "GeneMeanPlate"
}
features[[paste0("PlateFacPlate", idx)]] <- plate.facs[, idx]
features[[paste0(title, idx)]] <- base.plate.means[, idx]
}
plate.means <- base.plate.means
if (de.nGenes > 0) {
if (verbose) {message("Simulating differential expression...")}
plate.ingroup <- getParam(params, "plate.ingroup")
de.fc <- sqrt(getParam(params, "de.fc"))
ingroup <- match(plate.ingroup, levels(cell.plates))
de.chosen <- sample(nGenes, de.nGenes)
de.isUp <- rep(c(TRUE, FALSE), length.out = de.nGenes)
de.facs <- rep(1, nGenes)
de.facs[de.chosen[de.isUp]] <- de.fc
de.facs[de.chosen[!de.isUp]] <- 1 / de.fc
plate.means[, ingroup] <- plate.means[, ingroup] * de.facs
plate.means[, -ingroup] <- plate.means[, -ingroup] * (1 / de.facs)
phenos$Ingroup <- cell.plates %in% plate.ingroup
features$DEFacIngroup <- de.facs
features$DEFacOutgroup <- 1 / de.facs
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
features[[paste0("GeneMeanPlate", idx)]] <- plate.means[, idx]
}
}
if (verbose) {message("Simulating libray size factors...")}
lib.facs <- lib.sizes / mean(lib.sizes)
lib.facs <- sample(lib.facs, nCells, replace = TRUE) * lib.mod
phenos$LibSizeFac <- lib.facs
if (verbose) {message("Simulating cell means...")}
cell.means <- plate.means[, as.integer(cell.plates)]
cell.means <- t(t(cell.means) * lib.facs)
if (verbose) {message("Simulating counts...")}
true.counts <- matrix(rnbinom(nCells * nGenes, mu = cell.means,
size = 1 / gene.disps),
ncol = nCells, nrow = nGenes)
counts <- true.counts
if (zinb) {
if (verbose) {message("Simulating zero inflation...")}
is.zero <- matrix(rbinom(nCells * nGenes, 1, gene.ziProps),
ncol = nCells, nrow = nGenes) == 1
counts[is.zero] <- 0
}
if (verbose) {message("Creating final SCESet...")}
rownames(phenos) <- cell.names
rownames(features) <- gene.names
rownames(counts) <- gene.names
colnames(counts) <- cell.names
sim <- newSCESet(countData = counts, phenoData = phenos,
featureData = features)
rownames(cell.means) <- gene.names
colnames(cell.means) <- cell.names
set_exprs(sim, "CellMeans") <- cell.means
rownames(true.counts) <- gene.names
colnames(true.counts) <- cell.names
set_exprs(sim, "TrueCounts") <- true.counts
if (zinb) {
rownames(is.zero) <- gene.names
colnames(is.zero) <- cell.names
set_exprs(sim, "ZeroInflation") <- is.zero