Skip to content
Snippets Groups Projects
Commit 16cdf173 authored by Luke Zappia's avatar Luke Zappia
Browse files

Modify splatSimulate to use SplatParams

parent 9b75eea8
No related branches found
No related tags found
No related merge requests found
Package: splatter Package: splatter
Type: Package Type: Package
Title: Simple Simulation of Single-cell RNA Sequencing Data Title: Simple Simulation of Single-cell RNA Sequencing Data
Version: 0.6.7 Version: 0.6.8
Date: 2016-10-13 Date: 2016-10-13
Author: Luke Zappia Author: Luke Zappia
Authors@R: as.person(c( Authors@R: as.person(c(
......
...@@ -13,14 +13,31 @@ export(newSplatParams) ...@@ -13,14 +13,31 @@ export(newSplatParams)
export(setParam) export(setParam)
export(setParams) export(setParams)
export(simpleSimulate) export(simpleSimulate)
export(splatSimulate)
export(splatSimulateGroups)
export(splatSimulatePaths)
export(splatSimulateSingle)
exportClasses(SimpleParams) exportClasses(SimpleParams)
exportClasses(SplatParams) exportClasses(SplatParams)
importFrom(Biobase,"assayData<-")
importFrom(Biobase,"fData<-")
importFrom(Biobase,"pData<-")
importFrom(Biobase,assayData)
importFrom(Biobase,fData)
importFrom(Biobase,pData)
importFrom(checkmate,checkFlag) importFrom(checkmate,checkFlag)
importFrom(checkmate,checkInt) importFrom(checkmate,checkInt)
importFrom(checkmate,checkIntegerish) importFrom(checkmate,checkIntegerish)
importFrom(checkmate,checkNumber) importFrom(checkmate,checkNumber)
importFrom(checkmate,checkNumeric) importFrom(checkmate,checkNumeric)
importFrom(scater,counts)
importFrom(scater,newSCESet) importFrom(scater,newSCESet)
importFrom(stats,median) importFrom(stats,median)
importFrom(stats,rbinom)
importFrom(stats,rchisq)
importFrom(stats,rgamma) importFrom(stats,rgamma)
importFrom(stats,rlnorm)
importFrom(stats,rnbinom) importFrom(stats,rnbinom)
importFrom(stats,rnorm)
importFrom(stats,rpois)
importFrom(stats,runif)
#' Estimate Splatter simulation parameters #' Estimate Splatter simulation parameters
#' #'
#' Estimate simulation parameters for the Splatter simulation from a real #' Estimate simulation parameters for the Splatter simulation from a real
#' dataset. #' dataset. See the individual estimation functions for more details on how this
#' is done.
#' #'
#' @param counts either a counts matrix or an SCESet object containing count #' @param counts either a counts matrix or an SCESet object containing count
#' data to estimate parameters from. #' data to estimate parameters from.
#' @param params SplatParams object to store estimated values in. #' @param params SplatParams object to store estimated values in.
#' #'
#' @details #' @seealso
#' \code{\link{estSplatMean}}, \code{\link{estSplatLib}},
#' \code{\link{estSplatOutlier}}, \code{\link{estSplatBCV}},
#' \code{\link{estSplatDropout}}
#' #'
#' @return SplatParams object containing the estimated parameters. #' @return SplatParams object containing the estimated parameters.
#' #'
......
#' Splatter simulation
#'
#' Simulate count data from a fictional single-cell RNA-seq experiment using
#' the Splatter method.
#'
#' @param params SplatParams object containing parameters for the simulation.
#' See \code{\link{SplatParams}} for details.
#' @param method which simulation method to use. Options are "single" which
#' produces a single population, "groups" which produces distinct groups
#' (eg. cell types) or "paths" which selects cells from continuous
#' trajectories (eg. differentiation processes).
#' @param verbose logical. Whether to print progress messages.
#' @param ... any additional parameter settings to override what is provided in
#' \code{params}.
#'
#' @details
#' Parameters can be set in a variety of ways. If no parameters are provided
#' the default parameters are used. Any parameters in \code{params} can be
#' overridden by supplying additional arguments through a call to
#' \code{\link{setParams}}. This design allows the user flexibility in
#' how they supply parameters and allows small adjustments without creating a
#' new \code{SplatParams} object. See examples for a demonstration of how this
#' can be used.
#'
#' The simulation involves the following steps:
#' \enumerate{
#' \item Set up simulation object
#' \item Simulate library sizes
#' \item Simulate gene means
#' \item Simulate groups/paths
#' \item Simulate BCV adjusted cell means
#' \item Simulate true counts
#' \item Simulate dropout
#' \item Create final SCESet object
#' }
#'
#' The final output is an \code{\link[scater]{SCESet}} object that contains the
#' simulated counts but also the values for various intermediate steps. These
#' are stored in the \code{\link[Biobase]{phenoData}} (for cell specific
#' information), \code{\link[Biobase]{featureData}} (for gene specific
#' information) or \code{\link[Biobase]{assayData}} (for gene by cell matrices)
#' slots. This additional information includes:
#' \describe{
#' \item{\code{phenoData}}{
#' \describe{
#' \item{Cell}{Unique cell identifier.}
#' \item{Group}{The group or path the cell belongs to.}
#' \item{ExpLibSize}{The expected library size for that cell.}
#' \item{Step (paths only)}{how far along the path each cell is.}
#' }
#' }
#' \item{\code{featureData}}{
#' \describe{
#' \item{Gene}{Unique gene identifier.}
#' \item{BaseGeneMean}{The base expression level for that gene.}
#' \item{OutlierFactor}{Expression outlier factor for that gene. Values
#' of 1 indicate the gene is not an expression outlier.}
#' \item{GeneMean}{Expression level after applying outlier factors.}
#' \item{DEFac[Group]}{The differential expression factor for each gene
#' in a particular group. Values of 1 indicate the gene is not
#' differentially expressed.}
#' \item{GeneMean[Group]}{Expression level of a gene in a particular
#' group after applying differential expression factors.}
#' }
#' }
#' \item{\code{assayData}}{
#' \describe{
#' \item{BaseCellMeans}{The expression of genes in each cell adjusted for
#' expected library size.}
#' \item{BCV}{The Biological Coefficient of Variation for each gene in
#' each cell.}
#' \item{CellMeans}{The expression level of genes in each cell adjusted
#' for BCV.}
#' \item{TrueCounts}{The simulated counts before dropout.}
#' \item{Dropout}{Logical matrix showing which values have been dropped
#' in which cells.}
#' }
#' }
#' }
#'
#' Values that have been added by Splatter are named using \code{CamelCase} in
#' order to differentiate them from the values added by Scater which uses
#' \code{underscore_naming}.
#'
#' @return SCESet object containing the simulated counts and intermediate
#' values.
#'
#' @seealso
#' \code{\link{splatSimLibSizes}}, \code{\link{splatSimGeneMeans}},
#' \code{\link{splatSimDE}}, \code{\link{splatSimCellMeans}},
#' \code{\link{splatSimBCVMeans}}, \code{\link{splatSimTrueCounts}},
#' \code{\link{splatSimDropout}}
#'
#' @examples
#' # Simulation with default parameters
#' sim <- splatSimulate()
#' # Simulation with different number of genes
#' sim <- splatSimulate(nGenes = 1000)
#' # Simulation with custom parameters
#' params <- newSplatParams(nGenes = 100, mean.rate = 0.5)
#' sim <- splatSimulate(params)
#' # Simulation with adjusted custom parameters
#' sim <- splatSimulate(params, mean.rate = 0.6, out.prob = 0.2)
#' # Simulate groups
#' sim <- splatSimulate(method = "groups")
#' # Simulate paths
#' sim <- splatSimulate(method = "paths")
#' @importFrom Biobase fData pData pData<- assayData
#' @importFrom scater newSCESet counts
#' @export
splatSimulate <- function(params = newSplatParams(),
method = c("single", "groups", "paths"),
verbose = TRUE, ...) {
method <- match.arg(method)
if (verbose) {message("Getting parameters...")}
params <- setParams(params, ...)
params <- expandParams(params)
validObject(params)
# Set random seed
seed <- getParam(params, "seed")
set.seed(seed)
# Get the parameters we are going to use
nCells <- getParam(params, "nCells")
nGenes <- getParam(params, "nGenes")
nGroups <- getParam(params, "nGroups")
group.cells <- getParam(params, "groupCells")
if (nGroups == 1 && method == "groups") {
warning("nGroups is 1, switching to single mode")
method <- "single"
}
if (verbose) {message("Creating simulation object...")}
# Set up name vectors
cell.names <- paste0("Cell", 1:nCells)
gene.names <- paste0("Gene", 1:nGenes)
if (method == "groups") {
group.names <- paste0("Group", 1:nGroups)
} else if (method == "paths") {
group.names <- paste0("Path", 1:nGroups)
}
# Create SCESet with dummy counts to store simulation
dummy.counts <- matrix(1, ncol = nCells, nrow = nGenes)
rownames(dummy.counts) <- gene.names
colnames(dummy.counts) <- cell.names
phenos <- new("AnnotatedDataFrame", data = data.frame(Cell = cell.names))
rownames(phenos) <- cell.names
features <- new("AnnotatedDataFrame", data = data.frame(Gene = gene.names))
rownames(features) <- gene.names
sim <- newSCESet(countData = dummy.counts, phenoData = phenos,
featureData = features)
# Make groups vector which is the index of param$groupCells repeated
# params$groupCells[index] times
if (method != "single") {
groups <- lapply(1:nGroups, function(i, g) {rep(i, g[i])},
g = group.cells)
groups <- unlist(groups)
pData(sim)$Group <- group.names[groups]
}
if (verbose) {message("Simulating library sizes...")}
sim <- splatSimLibSizes(sim, params)
if (verbose) {message("Simulating gene means...")}
sim <- splatSimGeneMeans(sim, params)
if (method == "single") {
sim <- splatSimSingleCellMeans(sim, params)
} else if (method == "groups") {
if (verbose) {message("Simulating group DE...")}
sim <- splatSimGroupDE(sim, params)
if (verbose) {message("Simulating cell means...")}
sim <- splatSimGroupCellMeans(sim, params)
} else {
if (verbose) {message("Simulating path endpoints...")}
sim <- splatSimPathDE(sim, params)
if (verbose) {message("Simulating path steps...")}
sim <- splatSimPathCellMeans(sim, params)
}
if (verbose) {message("Simulating BCV...")}
sim <- splatSimBCVMeans(sim, params)
if (verbose) {message("Simulating counts..")}
sim <- splatSimTrueCounts(sim, params)
if (verbose) {message("Simulating dropout...")}
sim <- splatSimDropout(sim, params)
if (verbose) {message("Creating final SCESet...")}
# Create new SCESet to make sure values are calculated correctly
sce <- newSCESet(countData = counts(sim),
phenoData = new("AnnotatedDataFrame", data = pData(sim)),
featureData = new("AnnotatedDataFrame", data = fData(sim)))
# Add intermediate matrices stored in assayData
for (assay.name in names(assayData(sim))) {
if (!(assay.name %in% names(assayData(sce)))) {
assayData(sce)[[assay.name]] <- assayData(sim)[[assay.name]]
}
}
if (verbose) {message("Done!")}
return(sce)
}
#' @rdname splatSimulate
#' @export
splatSimulateSingle <- function(params = newSplatParams(),
verbose = TRUE, ...) {
sim <- splatSimulate(params = params, method = "single",
verbose = verbose, ...)
return(sim)
}
#' @rdname splatSimulate
#' @export
splatSimulateGroups <- function(params = newSplatParams(),
verbose = TRUE, ...) {
sim <- splatSimulate(params = params, method = "groups",
verbose = verbose, ...)
return(sim)
}
#' @rdname splatSimulate
#' @export
splatSimulatePaths <- function(params = newSplatParams(), verbose = TRUE, ...) {
sim <- splatSimulate(params = params, method = "paths",
verbose = verbose, ...)
return(sim)
}
#' Simulate library sizes
#'
#' Simulate expected library sizes from a log-normal distribution
#'
#' @param sim SCESet to add library size to.
#' @param params SplatParams object with simulation parameters.
#'
#' @return SCESet with simulated library sizes.
#'
#' @importFrom Biobase pData pData<-
#' @importFrom stats rlnorm
splatSimLibSizes <- function(sim, params) {
nCells <- getParam(params, "nCells")
lib.loc <- getParam(params, "lib.loc")
lib.scale <- getParam(params, "lib.scale")
exp.lib.sizes <- rlnorm(nCells, lib.loc, lib.scale)
pData(sim)$ExpLibSize <- exp.lib.sizes
return(sim)
}
#' Simulate gene means
#'
#' Simulate gene means from a gamma distribution. Also simulates outlier
#' expression factors. Genes with an outlier factor not equal to 1 are replaced
#' with the median mean expression multiplied by the outlier factor.
#'
#' @param sim SCESet to add gene means to.
#' @param params SplatParams object with simulation parameters.
#'
#' @return SCESet with simulated gene means.
#'
#' @importFrom Biobase fData fData<-
#' @importFrom stats rgamma median
splatSimGeneMeans <- function(sim, params) {
nGenes <- getParam(params, "nGenes")
mean.shape <- getParam(params, "mean.shape")
mean.rate <- getParam(params, "mean.rate")
out.prob <- getParam(params, "out.prob")
out.loProb <- getParam(params, "out.loProb")
out.facLoc <- getParam(params, "out.facLoc")
out.facScale <- getParam(params, "out.facScale")
# Simulate base gene means
base.means.gene <- rgamma(nGenes, shape = mean.shape, rate = mean.rate)
# Add expression outliers
outlier.facs <- getLNormFactors(nGenes, out.prob, out.loProb, out.facLoc,
out.facScale)
median.means.gene <- median(base.means.gene)
outlier.means <- median.means.gene * outlier.facs
is.outlier <- outlier.facs != 1
means.gene <- base.means.gene
means.gene[is.outlier] <- outlier.means[is.outlier]
fData(sim)$BaseGeneMean <- base.means.gene
fData(sim)$OutlierFactor <- outlier.facs
fData(sim)$GeneMean <- means.gene
return(sim)
}
#' Simulate group differential expression
#'
#' Simulate differential expression. Differential expression factors for each
#' group are produced using \code{\link{getLNormFactors}} and these are added
#' along with updated means for each group. For paths care is taked to make sure
#' they are simualated in the correct order.
#'
#' @param sim SCESet to add differential expression to.
#' @param params splatParams object with simulation parameters.
#'
#' @return SCESet with simulated differential expression.
#'
#' @name splatSimDE
NULL
#' @rdname splatSimDE
#' @importFrom Biobase fData
splatSimGroupDE <- function(sim, params) {
nGenes <- getParam(params, "nGenes")
nGroups <- getParam(params, "nGroups")
de.prob <- getParam(params, "de.prob")
de.downProb <- getParam(params, "de.downProb")
de.facLoc <- getParam(params, "de.facLoc")
de.facScale <- getParam(params, "de.facScale")
means.gene <- fData(sim)$GeneMean
for (idx in 1:nGroups) {
de.facs <- getLNormFactors(nGenes, de.prob[idx], de.downProb[idx],
de.facLoc[idx], de.facScale[idx])
group.means.gene <- means.gene * de.facs
fData(sim)[[paste0("DEFacGroup", idx)]] <- de.facs
fData(sim)[[paste0("GeneMeanGroup", idx)]] <- group.means.gene
}
return(sim)
}
#' @rdname splatSimDE
#' @importFrom Biobase fData
splatSimPathDE <- function(sim, params) {
nGenes <- getParam(params, "nGenes")
de.prob <- getParam(params, "de.prob")
de.downProb <- getParam(params, "de.downProb")
de.facLoc <- getParam(params, "de.facLoc")
de.facScale <- getParam(params, "de.facScale")
path.from <- getParam(params, "path.from")
path.order <- getPathOrder(path.from)
for (path in path.order) {
from <- path.from[path]
if (from == 0) {
means.gene <- fData(sim)$GeneMean
} else {
means.gene <- fData(sim)[[paste0("GeneMeanPath", from)]]
}
de.facs <- getLNormFactors(nGenes, de.prob[path], de.downProb[path],
de.facLoc[path], de.facScale[path])
path.means.gene <- means.gene * de.facs
fData(sim)[[paste0("DEFacPath", path)]] <- de.facs
fData(sim)[[paste0("GeneMeanPath", path)]] <- path.means.gene
}
return(sim)
}
#' Simulate cell means
#'
#' Simulate a gene by cell matrix giving the mean expression for each gene in
#' each cell. Cells start with the mean expression for the group they belong to
#' (when simulating groups) or cells are assigned the mean expression from a
#' random position on the appropriate path (when simulating paths). The selected
#' means are adjusted for each cell's expected library size.
#'
#' @param sim SCESet to add cell means to.
#' @param params SplatParams object with simulation parameters.
#'
#' @return SCESet with added cell means.
#'
#' @name splatSimCellMeans
NULL
#' @rdname splatSimCellMeans
#' @importFrom Biobase fData pData assayData assayData<-
splatSimSingleCellMeans <- function(sim, params) {
nCells <- getParam(params, "nCells")
cell.names <- pData(sim)$Cell
gene.names <- fData(sim)$Gene
exp.lib.sizes <- pData(sim)$ExpLibSize
cell.means.gene <- as.matrix(fData(sim)[, rep("GeneMean", nCells)])
cell.props.gene <- t(t(cell.means.gene) / colSums(cell.means.gene))
base.means.cell <- t(t(cell.props.gene) * exp.lib.sizes)
colnames(base.means.cell) <- cell.names
rownames(base.means.cell) <- gene.names
assayData(sim)$BaseCellMeans <- base.means.cell
return(sim)
}
#' @rdname splatSimCellMeans
#' @importFrom Biobase fData pData assayData assayData<-
splatSimGroupCellMeans <- function(sim, params) {
nGroups <- getParam(params, "nGroups")
cell.names <- pData(sim)$Cell
gene.names <- fData(sim)$Gene
groups <- pData(sim)$Group
group.names <- unique(groups)
exp.lib.sizes <- pData(sim)$ExpLibSize
group.means.gene <- fData(sim)[, paste0("GeneMean", group.names)]
cell.means.gene <- as.matrix(group.means.gene[, factor(groups)])
cell.props.gene <- t(t(cell.means.gene) / colSums(cell.means.gene))
base.means.cell <- t(t(cell.props.gene) * exp.lib.sizes)
colnames(base.means.cell) <- cell.names
rownames(base.means.cell) <- gene.names
assayData(sim)$BaseCellMeans <- base.means.cell
return(sim)
}
#' @rdname splatSimCellMeans
#' @importFrom Biobase fData pData assayData
#' @importFrom stats rbinom
splatSimPathCellMeans <- function(sim, params) {
nGenes <- getParam(params, "nGenes")
nGroups <- getParam(params, "nGroups")
group.cells <- getParam(params, "groupCells")
path.from <- getParam(params, "path.from")
path.length <- getParam(params, "path.length")
path.skew <- getParam(params, "path.skew")
path.nonlinearProb <- getParam(params, "path.nonlinearProb")
path.sigmaFac <- getParam(params, "path.sigmaFac")
cell.names <- pData(sim)$Cell
gene.names <- fData(sim)$Gene
groups <- pData(sim)$Group
group.names <- unique(groups)
exp.lib.sizes <- pData(sim)$ExpLibSize
# Generate paths. Each path is a matrix with path.length columns and
# nGenes rows where the expression from each genes changes along the path.
path.steps <- lapply(seq_along(path.from), function(idx) {
from <- path.from[idx]
# Find the means at the starting position
if (from == 0) {
means.start <- fData(sim)$GeneMean
} else {
means.start <- fData(sim)[[paste0("GeneMeanPath", from)]]
}
# Find the means at the end position
means.end <- fData(sim)[[paste0("GeneMeanPath", idx)]]
# Select genes to follow a non-linear path
is.nonlinear <- as.logical(rbinom(nGenes, 1, path.nonlinearProb))
sigma.facs <- rep(0, nGenes)
sigma.facs[is.nonlinear] <- path.sigmaFac
# Build Brownian bridges from start to end
steps <- buildBridges(means.start, means.end, n = path.length[idx],
sigma.fac = sigma.facs)
fData(sim)[[paste0("SigmaFacPath", idx)]] <- sigma.facs
return(t(steps))
})
# Randomly assign a position in the appropriate path to each cell
cell.steps <- lapply(1:nGroups, function(idx) {
path.probs <- seq(path.skew[idx], 1 - path.skew[idx],
length = path.length[idx])
path.probs <- path.probs / sum(path.probs)
steps <- sort(sample(1:path.length[idx], group.cells[idx],
prob = path.probs, replace = TRUE))
return(steps)
})
# Collect the underlying expression levels for each cell
cell.means.gene <- lapply(1:nGroups, function(idx) {
cell.means <- path.steps[[idx]][, cell.steps[[idx]]]
return(cell.means)
})
cell.means.gene <- do.call(cbind, cell.means.gene)
# Adjust expression based on library size
cell.props.gene <- t(t(cell.means.gene) / colSums(cell.means.gene))
base.means.cell <- t(t(cell.props.gene) * exp.lib.sizes)
colnames(base.means.cell) <- cell.names
rownames(base.means.cell) <- gene.names
pData(sim)$Step <- unlist(cell.steps)
assayData(sim)$BaseCellMeans <- base.means.cell
return(sim)
}
#' Simulate BCV means
#'
#' Simulate means for each gene in each cell that are adjusted to follow a
#' mean-variance trend using Biological Coefficient of Variation taken from
#' and inverse gamma distribution.
#'
#' @param sim SCESet to add BCV means to.
#' @param params SplatParams object with simulation parameters.
#'
#' @return SCESet with simulated BCV means.
#'
#' @importFrom Biobase fData pData assayData assayData<-
#' @importFrom stats rchisq rgamma
splatSimBCVMeans <- function(sim, params) {
nGenes <- getParam(params, "nGenes")
nCells <- getParam(params, "nCells")
bcv.common <- getParam(params, "bcv.common")
bcv.df <- getParam(params, "bcv.df")
cell.names <- pData(sim)$Cell
gene.names <- fData(sim)$Gene
base.means.cell <- assayData(sim)$BaseCellMeans
bcv <- (bcv.common + (1 / sqrt(base.means.cell))) *
sqrt(bcv.df / rchisq(nGenes, df = bcv.df))
means.cell <- matrix(rgamma(nGenes * nCells, shape = 1 / (bcv ^ 2),
scale = base.means.cell * (bcv ^ 2)),
nrow = nGenes, ncol = nCells)
colnames(bcv) <- cell.names
rownames(bcv) <- gene.names
colnames(means.cell) <- cell.names
rownames(means.cell) <- gene.names
assayData(sim)$BCV <- bcv
assayData(sim)$CellMeans <- means.cell
return(sim)
}
#' Simulate true counts
#'
#' Simulate a true counts matrix. Counts are simulated from a poisson
#' distribution where Each gene in each cell has it's own mean based on the
#' group (or path position), expected library size and BCV.
#'
#' @param sim SCESet to add true counts to.
#' @param params SplatParams object with simulation parameters.
#'
#' @return SCESet with simulated true counts.
#'
#' @importFrom Biobase fData pData assayData
#' @importFrom stats rpois
splatSimTrueCounts <- function(sim, params) {
nGenes <- getParam(params, "nGenes")
nCells <- getParam(params, "nCells")
cell.names <- pData(sim)$Cell
gene.names <- fData(sim)$Gene
cell.means <- assayData(sim)$CellMeans
true.counts <- matrix(rpois(nGenes * nCells, lambda = cell.means),
nrow = nGenes, ncol = nCells)
colnames(true.counts) <- cell.names
rownames(true.counts) <- gene.names
assayData(sim)$TrueCounts <- true.counts
return(sim)
}
#' Simulate dropout
#'
#' A logistic function is used to form a relationshop between the expression
#' level of a gene and the probability of dropout, giving a probability for each
#' gene in each cell. These probabilities are used in a Bernoulli distribution
#' to decide which counts should be dropped.
#'
#' @param sim SCESet to add dropout to.
#' @param params SplatParams object with simulation parameters.
#'
#' @return SCESet with simulated dropout and observed counts.
#'
#' @importFrom Biobase fData pData assayData assayData<-
#' @importFrom stats rbinom
splatSimDropout <- function(sim, params) {
dropout.present <- getParam(params, "dropout.present")
true.counts <- assayData(sim)$TrueCounts
if (dropout.present) {
nCells <- getParam(params, "nCells")
nGenes <- getParam(params, "nGenes")
dropout.mid <- getParam(params, "dropout.mid")
dropout.shape <- getParam(params, "dropout.shape")
cell.names <- pData(sim)$Cell
gene.names <- fData(sim)$Gene
cell.means <- assayData(sim)$CellMeans
# Generate probabilites based on expression
lib.sizes <- colSums(true.counts)
cell.facs <- log(lib.sizes) / median(lib.sizes)
drop.prob <- sapply(1:nCells, function(idx) {
eta <- cell.facs[idx] * (log(cell.means[, idx]))
return(logistic(eta, x0 = dropout.mid, k = dropout.shape))
})
# Decide which counts to keep
keep <- matrix(rbinom(nCells * nGenes, 1, 1 - drop.prob),
nrow = nGenes, ncol = nCells)
counts <- true.counts * keep
colnames(drop.prob) <- cell.names
rownames(drop.prob) <- gene.names
colnames(keep) <- cell.names
rownames(keep) <- gene.names
assayData(sim)$DropProb <- drop.prob
assayData(sim)$Dropout <- !keep
} else {
counts <- true.counts
}
scater::counts(sim) <- counts
return(sim)
}
#' Get log-normal factors
#'
#' Randomly generate multiplication factors from a log-normal distribution.
#'
#' @param n.facs Number of factors to generate.
#' @param sel.prob Probability that a factor will be selected to be different
#' from 1.
#' @param neg.prob Probability that a selected factor is less than one.
#' @param fac.loc Location parameter for the log-normal distribution.
#' @param fac.scale Scale factor for the log-normal distribution.
#'
#' @return Vector containing generated factors.
#'
#' @importFrom stats rbinom rlnorm
getLNormFactors <- function(n.facs, sel.prob, neg.prob, fac.loc, fac.scale) {
is.selected <- as.logical(rbinom(n.facs, 1, sel.prob))
n.selected <- sum(is.selected)
dir.selected <- (-1) ^ rbinom(n.selected, 1, neg.prob)
facs.selected <- rlnorm(n.selected, fac.loc, fac.scale)
# Reverse directions for factors that are less than one
dir.selected[facs.selected < 1 & dir.selected == -1] <- 1
dir.selected[facs.selected < 1 & dir.selected == 1] <- -1
factors <- rep(1, n.facs)
factors[is.selected] <- facs.selected ^ dir.selected
return(factors)
}
#' Get path order
#'
#' Identify the correct order to process paths so that preceding paths have
#' already been simulated.
#'
#' @param path.from vector giving the path endpoints that each path orginates
#' from.
#'
#' @return Vector giving the order to process paths in.
getPathOrder <- function(path.from) {
# Transform the vector into a list of (from, to) pairs
path.pairs <- list()
for (idx in 1:length(path.from)) {
path.pairs[[idx]] <- c(path.from[idx], idx)
}
# Determine the processing order
# If a path is in the "done" vector any path originating here can be
# completed
done <- 0
while (length(path.pairs) > 0) {
path.pair <- path.pairs[[1]]
path.pairs <- path.pairs[-1]
from <- path.pair[1]
to <- path.pair[2]
if (from %in% done) {
done <- c(done, to)
} else {
path.pairs <- c(path.pairs, list(path.pair))
}
}
# Remove the origin from the vector
done <- done[-1]
return(done)
}
#' Brownian bridge
#'
#' Calculate a smoothed Brownian bridge between two points. A Brownian bridge is
#' a random walk with fixed end points.
#'
#' @param x starting value.
#' @param y end value.
#' @param N number of steps in random walk.
#' @param n number of points in smoothed bridge.
#' @param sigma.fac multiplier specifying how extreme each step can be.
#'
#' @return Vector of length n following a path from x to y.
#'
#' @importFrom stats runif rnorm
bridge <- function (x = 0, y = 0, N = 5, n = 100, sigma.fac = 0.8) {
dt <- 1 / (N - 1)
t <- seq(0, 1, length = N)
sigma2 <- runif(1, 0, sigma.fac * mean(c(x, y)))
X <- c(0, cumsum(rnorm(N - 1, sd = sigma2) * sqrt(dt)))
BB <- x + X - t * (X[N] - y + x)
BB <- akima::aspline(BB, n = n)$y
BB[BB < 0] <- 0
return(BB)
}
buildBridges <- Vectorize(bridge, vectorize.args = c("x", "y", "sigma.fac"))
\ No newline at end of file
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splat-simulate.R
\name{bridge}
\alias{bridge}
\title{Brownian bridge}
\usage{
bridge(x = 0, y = 0, N = 5, n = 100, sigma.fac = 0.8)
}
\arguments{
\item{x}{starting value.}
\item{y}{end value.}
\item{N}{number of steps in random walk.}
\item{n}{number of points in smoothed bridge.}
\item{sigma.fac}{multiplier specifying how extreme each step can be.}
}
\value{
Vector of length n following a path from x to y.
}
\description{
Calculate a smoothed Brownian bridge between two points. A Brownian bridge is
a random walk with fixed end points.
}
% Generated by roxygen2: do not edit by hand % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splat-estimate.R % Please edit documentation in R/splat-estimate.R
\name{estSplatMeans} \name{estSplatMean}
\alias{estSplatMeans} \alias{estSplatMean}
\title{Estimate Splatter mean parameters} \title{Estimate Splatter mean parameters}
\usage{ \usage{
estSplatMeans(norm.counts, params) estSplatMean(norm.counts, params)
} }
\arguments{ \arguments{
\item{norm.counts}{library size normalised counts matrix.} \item{norm.counts}{library size normalised counts matrix.}
......
...@@ -6,15 +6,15 @@ ...@@ -6,15 +6,15 @@
\alias{estimateSimpleParams.matrix} \alias{estimateSimpleParams.matrix}
\title{Estimate simple simulation parameters} \title{Estimate simple simulation parameters}
\usage{ \usage{
estimateSimpleParams(data, params = newSimpleParams()) estimateSimpleParams(counts, params = newSimpleParams())
\method{estimateSimpleParams}{SCESet}(data, params = newSimpleParams()) \method{estimateSimpleParams}{SCESet}(counts, params = newSimpleParams())
\method{estimateSimpleParams}{matrix}(data, params = newSimpleParams()) \method{estimateSimpleParams}{matrix}(counts, params = newSimpleParams())
} }
\arguments{ \arguments{
\item{data}{either a counts matrix or an SCESet object containing count data \item{counts}{either a counts matrix or an SCESet object containing count
to estimate parameters from.} data to estimate parameters from.}
\item{params}{SimpleParams object to store estimated values in.} \item{params}{SimpleParams object to store estimated values in.}
} }
......
...@@ -6,15 +6,15 @@ ...@@ -6,15 +6,15 @@
\alias{estimateSplatParams.matrix} \alias{estimateSplatParams.matrix}
\title{Estimate Splatter simulation parameters} \title{Estimate Splatter simulation parameters}
\usage{ \usage{
estimateSplatParams(data, params = newSplatParams()) estimateSplatParams(counts, params = newSplatParams())
\method{estimateSplatParams}{SCESet}(data, params = newSplatParams()) \method{estimateSplatParams}{SCESet}(counts, params = newSplatParams())
\method{estimateSplatParams}{matrix}(data, params = newSplatParams()) \method{estimateSplatParams}{matrix}(counts, params = newSplatParams())
} }
\arguments{ \arguments{
\item{data}{either a counts matrix or an SCESet object containing count data \item{counts}{either a counts matrix or an SCESet object containing count
to estimate parameters from.} data to estimate parameters from.}
\item{params}{SplatParams object to store estimated values in.} \item{params}{SplatParams object to store estimated values in.}
} }
...@@ -23,14 +23,17 @@ SplatParams object containing the estimated parameters. ...@@ -23,14 +23,17 @@ SplatParams object containing the estimated parameters.
} }
\description{ \description{
Estimate simulation parameters for the Splatter simulation from a real Estimate simulation parameters for the Splatter simulation from a real
dataset. dataset. See the individual estimation functions for more details on how this
} is done.
\details{
} }
\examples{ \examples{
data("sc_example_counts") data("sc_example_counts")
params <- estimateSplatParams(sc_example_counts) params <- estimateSplatParams(sc_example_counts)
params params
} }
\seealso{
\code{\link{estSplatMean}}, \code{\link{estSplatLib}},
\code{\link{estSplatOutlier}}, \code{\link{estSplatBCV}},
\code{\link{estSplatDropout}}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splat-simulate.R
\name{getLNormFactors}
\alias{getLNormFactors}
\title{Get log-normal factors}
\usage{
getLNormFactors(n.facs, sel.prob, neg.prob, fac.loc, fac.scale)
}
\arguments{
\item{n.facs}{Number of factors to generate.}
\item{sel.prob}{Probability that a factor will be selected to be different
from 1.}
\item{neg.prob}{Probability that a selected factor is less than one.}
\item{fac.loc}{Location parameter for the log-normal distribution.}
\item{fac.scale}{Scale factor for the log-normal distribution.}
}
\value{
Vector containing generated factors.
}
\description{
Randomly generate multiplication factors from a log-normal distribution.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splat-simulate.R
\name{getPathOrder}
\alias{getPathOrder}
\title{Get path order}
\usage{
getPathOrder(path.from)
}
\arguments{
\item{path.from}{vector giving the path endpoints that each path orginates
from.}
}
\value{
Vector giving the order to process paths in.
}
\description{
Identify the correct order to process paths so that preceding paths have
already been simulated.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splat-simulate.R
\name{splatSimBCVMeans}
\alias{splatSimBCVMeans}
\title{Simulate BCV means}
\usage{
splatSimBCVMeans(sim, params)
}
\arguments{
\item{sim}{SCESet to add BCV means to.}
\item{params}{SplatParams object with simulation parameters.}
}
\value{
SCESet with simulated BCV means.
}
\description{
Simulate means for each gene in each cell that are adjusted to follow a
mean-variance trend using Biological Coefficient of Variation taken from
and inverse gamma distribution.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splat-simulate.R
\name{splatSimCellMeans}
\alias{splatSimCellMeans}
\alias{splatSimGroupCellMeans}
\alias{splatSimPathCellMeans}
\alias{splatSimSingleCellMeans}
\title{Simulate cell means}
\usage{
splatSimSingleCellMeans(sim, params)
splatSimGroupCellMeans(sim, params)
splatSimPathCellMeans(sim, params)
}
\arguments{
\item{sim}{SCESet to add cell means to.}
\item{params}{SplatParams object with simulation parameters.}
}
\value{
SCESet with added cell means.
}
\description{
Simulate a gene by cell matrix giving the mean expression for each gene in
each cell. Cells start with the mean expression for the group they belong to
(when simulating groups) or cells are assigned the mean expression from a
random position on the appropriate path (when simulating paths). The selected
means are adjusted for each cell's expected library size.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splat-simulate.R
\name{splatSimDE}
\alias{splatSimDE}
\alias{splatSimGroupDE}
\alias{splatSimPathDE}
\title{Simulate group differential expression}
\usage{
splatSimGroupDE(sim, params)
splatSimPathDE(sim, params)
}
\arguments{
\item{sim}{SCESet to add differential expression to.}
\item{params}{splatParams object with simulation parameters.}
}
\value{
SCESet with simulated differential expression.
}
\description{
Simulate differential expression. Differential expression factors for each
group are produced using \code{\link{getLNormFactors}} and these are added
along with updated means for each group. For paths care is taked to make sure
they are simualated in the correct order.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splat-simulate.R
\name{splatSimDropout}
\alias{splatSimDropout}
\title{Simulate dropout}
\usage{
splatSimDropout(sim, params)
}
\arguments{
\item{sim}{SCESet to add dropout to.}
\item{params}{SplatParams object with simulation parameters.}
}
\value{
SCESet with simulated dropout and observed counts.
}
\description{
A logistic function is used to form a relationshop between the expression
level of a gene and the probability of dropout, giving a probability for each
gene in each cell. These probabilities are used in a Bernoulli distribution
to decide which counts should be dropped.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splat-simulate.R
\name{splatSimGeneMeans}
\alias{splatSimGeneMeans}
\title{Simulate gene means}
\usage{
splatSimGeneMeans(sim, params)
}
\arguments{
\item{sim}{SCESet to add gene means to.}
\item{params}{SplatParams object with simulation parameters.}
}
\value{
SCESet with simulated gene means.
}
\description{
Simulate gene means from a gamma distribution. Also simulates outlier
expression factors. Genes with an outlier factor not equal to 1 are replaced
with the median mean expression multiplied by the outlier factor.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splat-simulate.R
\name{splatSimLibSizes}
\alias{splatSimLibSizes}
\title{Simulate library sizes}
\usage{
splatSimLibSizes(sim, params)
}
\arguments{
\item{sim}{SCESet to add library size to.}
\item{params}{SplatParams object with simulation parameters.}
}
\value{
SCESet with simulated library sizes.
}
\description{
Simulate expected library sizes from a log-normal distribution
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splat-simulate.R
\name{splatSimTrueCounts}
\alias{splatSimTrueCounts}
\title{Simulate true counts}
\usage{
splatSimTrueCounts(sim, params)
}
\arguments{
\item{sim}{SCESet to add true counts to.}
\item{params}{SplatParams object with simulation parameters.}
}
\value{
SCESet with simulated true counts.
}
\description{
Simulate a true counts matrix. Counts are simulated from a poisson
distribution where Each gene in each cell has it's own mean based on the
group (or path position), expected library size and BCV.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splat-simulate.R
\name{splatSimulate}
\alias{splatSimulate}
\alias{splatSimulateGroups}
\alias{splatSimulatePaths}
\alias{splatSimulateSingle}
\title{Splatter simulation}
\usage{
splatSimulate(params = newSplatParams(), method = c("single", "groups",
"paths"), verbose = TRUE, ...)
splatSimulateSingle(params = newSplatParams(), verbose = TRUE, ...)
splatSimulateGroups(params = newSplatParams(), verbose = TRUE, ...)
splatSimulatePaths(params = newSplatParams(), verbose = TRUE, ...)
}
\arguments{
\item{params}{SplatParams object containing parameters for the simulation.
See \code{\link{SplatParams}} for details.}
\item{method}{which simulation method to use. Options are "single" which
produces a single population, "groups" which produces distinct groups
(eg. cell types) or "paths" which selects cells from continuous
trajectories (eg. differentiation processes).}
\item{verbose}{logical. Whether to print progress messages.}
\item{...}{any additional parameter settings to override what is provided in
\code{params}.}
}
\value{
SCESet object containing the simulated counts and intermediate
values.
}
\description{
Simulate count data from a fictional single-cell RNA-seq experiment using
the Splatter method.
}
\details{
Parameters can be set in a variety of ways. If no parameters are provided
the default parameters are used. Any parameters in \code{params} can be
overridden by supplying additional arguments through a call to
\code{\link{setParams}}. This design allows the user flexibility in
how they supply parameters and allows small adjustments without creating a
new \code{SplatParams} object. See examples for a demonstration of how this
can be used.
The simulation involves the following steps:
\enumerate{
\item Set up simulation object
\item Simulate library sizes
\item Simulate gene means
\item Simulate groups/paths
\item Simulate BCV adjusted cell means
\item Simulate true counts
\item Simulate dropout
\item Create final SCESet object
}
The final output is an \code{\link[scater]{SCESet}} object that contains the
simulated counts but also the values for various intermediate steps. These
are stored in the \code{\link[Biobase]{phenoData}} (for cell specific
information), \code{\link[Biobase]{featureData}} (for gene specific
information) or \code{\link[Biobase]{assayData}} (for gene by cell matrices)
slots. This additional information includes:
\describe{
\item{\code{phenoData}}{
\describe{
\item{Cell}{Unique cell identifier.}
\item{Group}{The group or path the cell belongs to.}
\item{ExpLibSize}{The expected library size for that cell.}
\item{Step (paths only)}{how far along the path each cell is.}
}
}
\item{\code{featureData}}{
\describe{
\item{Gene}{Unique gene identifier.}
\item{BaseGeneMean}{The base expression level for that gene.}
\item{OutlierFactor}{Expression outlier factor for that gene. Values
of 1 indicate the gene is not an expression outlier.}
\item{GeneMean}{Expression level after applying outlier factors.}
\item{DEFac[Group]}{The differential expression factor for each gene
in a particular group. Values of 1 indicate the gene is not
differentially expressed.}
\item{GeneMean[Group]}{Expression level of a gene in a particular
group after applying differential expression factors.}
}
}
\item{\code{assayData}}{
\describe{
\item{BaseCellMeans}{The expression of genes in each cell adjusted for
expected library size.}
\item{BCV}{The Biological Coefficient of Variation for each gene in
each cell.}
\item{CellMeans}{The expression level of genes in each cell adjusted
for BCV.}
\item{TrueCounts}{The simulated counts before dropout.}
\item{Dropout}{Logical matrix showing which values have been dropped
in which cells.}
}
}
}
Values that have been added by Splatter are named using \code{CamelCase} in
order to differentiate them from the values added by Scater which uses
\code{underscore_naming}.
}
\examples{
# Simulation with default parameters
sim <- splatSimulate()
# Simulation with different number of genes
sim <- splatSimulate(nGenes = 1000)
# Simulation with custom parameters
params <- newSplatParams(nGenes = 100, mean.rate = 0.5)
sim <- splatSimulate(params)
# Simulation with adjusted custom parameters
sim <- splatSimulate(params, mean.rate = 0.6, out.prob = 0.2)
# Simulate groups
sim <- splatSimulate(method = "groups")
# Simulate paths
sim <- splatSimulate(method = "paths")
}
\seealso{
\code{\link{splatSimLibSizes}}, \code{\link{splatSimGeneMeans}},
\code{\link{splatSimDE}}, \code{\link{splatSimCellMeans}},
\code{\link{splatSimBCVMeans}}, \code{\link{splatSimTrueCounts}},
\code{\link{splatSimDropout}}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment