From 16cdf173772f72e883219cca479166ff61bf7476 Mon Sep 17 00:00:00 2001 From: Luke Zappia <lazappi@users.noreply.github.com> Date: Fri, 14 Oct 2016 10:59:42 +1100 Subject: [PATCH] Modify splatSimulate to use SplatParams --- DESCRIPTION | 2 +- NAMESPACE | 17 + R/splat-estimate.R | 8 +- R/splat-simulate.R | 726 ++++++++++++++++++++++ man/bridge.Rd | 27 + man/{estSplatMeans.Rd => estSplatMean.Rd} | 6 +- man/estimateSimpleParams.Rd | 10 +- man/estimateSplatParams.Rd | 21 +- man/getLNormFactors.Rd | 27 + man/getPathOrder.Rd | 20 + man/splatSimBCVMeans.Rd | 22 + man/splatSimCellMeans.Rd | 31 + man/splatSimDE.Rd | 27 + man/splatSimDropout.Rd | 23 + man/splatSimGeneMeans.Rd | 22 + man/splatSimLibSizes.Rd | 20 + man/splatSimTrueCounts.Rd | 22 + man/splatSimulate.Rd | 131 ++++ 18 files changed, 1142 insertions(+), 20 deletions(-) create mode 100644 R/splat-simulate.R create mode 100644 man/bridge.Rd rename man/{estSplatMeans.Rd => estSplatMean.Rd} (87%) create mode 100644 man/getLNormFactors.Rd create mode 100644 man/getPathOrder.Rd create mode 100644 man/splatSimBCVMeans.Rd create mode 100644 man/splatSimCellMeans.Rd create mode 100644 man/splatSimDE.Rd create mode 100644 man/splatSimDropout.Rd create mode 100644 man/splatSimGeneMeans.Rd create mode 100644 man/splatSimLibSizes.Rd create mode 100644 man/splatSimTrueCounts.Rd create mode 100644 man/splatSimulate.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 705e045..224d4e7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: splatter Type: Package Title: Simple Simulation of Single-cell RNA Sequencing Data -Version: 0.6.7 +Version: 0.6.8 Date: 2016-10-13 Author: Luke Zappia Authors@R: as.person(c( diff --git a/NAMESPACE b/NAMESPACE index 0bdc3f5..7fc9678 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,14 +13,31 @@ export(newSplatParams) export(setParam) export(setParams) export(simpleSimulate) +export(splatSimulate) +export(splatSimulateGroups) +export(splatSimulatePaths) +export(splatSimulateSingle) exportClasses(SimpleParams) exportClasses(SplatParams) +importFrom(Biobase,"assayData<-") +importFrom(Biobase,"fData<-") +importFrom(Biobase,"pData<-") +importFrom(Biobase,assayData) +importFrom(Biobase,fData) +importFrom(Biobase,pData) importFrom(checkmate,checkFlag) importFrom(checkmate,checkInt) importFrom(checkmate,checkIntegerish) importFrom(checkmate,checkNumber) importFrom(checkmate,checkNumeric) +importFrom(scater,counts) importFrom(scater,newSCESet) importFrom(stats,median) +importFrom(stats,rbinom) +importFrom(stats,rchisq) importFrom(stats,rgamma) +importFrom(stats,rlnorm) importFrom(stats,rnbinom) +importFrom(stats,rnorm) +importFrom(stats,rpois) +importFrom(stats,runif) diff --git a/R/splat-estimate.R b/R/splat-estimate.R index df89d63..b4b2333 100644 --- a/R/splat-estimate.R +++ b/R/splat-estimate.R @@ -1,13 +1,17 @@ #' Estimate Splatter simulation parameters #' #' Estimate simulation parameters for the Splatter simulation from a real -#' dataset. +#' dataset. See the individual estimation functions for more details on how this +#' is done. #' #' @param counts either a counts matrix or an SCESet object containing count #' data to estimate parameters from. #' @param params SplatParams object to store estimated values in. #' -#' @details +#' @seealso +#' \code{\link{estSplatMean}}, \code{\link{estSplatLib}}, +#' \code{\link{estSplatOutlier}}, \code{\link{estSplatBCV}}, +#' \code{\link{estSplatDropout}} #' #' @return SplatParams object containing the estimated parameters. #' diff --git a/R/splat-simulate.R b/R/splat-simulate.R new file mode 100644 index 0000000..a486a05 --- /dev/null +++ b/R/splat-simulate.R @@ -0,0 +1,726 @@ +#' Splatter simulation +#' +#' Simulate count data from a fictional single-cell RNA-seq experiment using +#' the Splatter method. +#' +#' @param params SplatParams object containing parameters for the simulation. +#' See \code{\link{SplatParams}} for details. +#' @param method which simulation method to use. Options are "single" which +#' produces a single population, "groups" which produces distinct groups +#' (eg. cell types) or "paths" which selects cells from continuous +#' trajectories (eg. differentiation processes). +#' @param verbose logical. Whether to print progress messages. +#' @param ... any additional parameter settings to override what is provided in +#' \code{params}. +#' +#' @details +#' Parameters can be set in a variety of ways. If no parameters are provided +#' the default parameters are used. Any parameters in \code{params} can be +#' overridden by supplying additional arguments through a call to +#' \code{\link{setParams}}. This design allows the user flexibility in +#' how they supply parameters and allows small adjustments without creating a +#' new \code{SplatParams} object. See examples for a demonstration of how this +#' can be used. +#' +#' The simulation involves the following steps: +#' \enumerate{ +#' \item Set up simulation object +#' \item Simulate library sizes +#' \item Simulate gene means +#' \item Simulate groups/paths +#' \item Simulate BCV adjusted cell means +#' \item Simulate true counts +#' \item Simulate dropout +#' \item Create final SCESet object +#' } +#' +#' The final output is an \code{\link[scater]{SCESet}} object that contains the +#' simulated counts but also the values for various intermediate steps. These +#' are stored in the \code{\link[Biobase]{phenoData}} (for cell specific +#' information), \code{\link[Biobase]{featureData}} (for gene specific +#' information) or \code{\link[Biobase]{assayData}} (for gene by cell matrices) +#' slots. This additional information includes: +#' \describe{ +#' \item{\code{phenoData}}{ +#' \describe{ +#' \item{Cell}{Unique cell identifier.} +#' \item{Group}{The group or path the cell belongs to.} +#' \item{ExpLibSize}{The expected library size for that cell.} +#' \item{Step (paths only)}{how far along the path each cell is.} +#' } +#' } +#' \item{\code{featureData}}{ +#' \describe{ +#' \item{Gene}{Unique gene identifier.} +#' \item{BaseGeneMean}{The base expression level for that gene.} +#' \item{OutlierFactor}{Expression outlier factor for that gene. Values +#' of 1 indicate the gene is not an expression outlier.} +#' \item{GeneMean}{Expression level after applying outlier factors.} +#' \item{DEFac[Group]}{The differential expression factor for each gene +#' in a particular group. Values of 1 indicate the gene is not +#' differentially expressed.} +#' \item{GeneMean[Group]}{Expression level of a gene in a particular +#' group after applying differential expression factors.} +#' } +#' } +#' \item{\code{assayData}}{ +#' \describe{ +#' \item{BaseCellMeans}{The expression of genes in each cell adjusted for +#' expected library size.} +#' \item{BCV}{The Biological Coefficient of Variation for each gene in +#' each cell.} +#' \item{CellMeans}{The expression level of genes in each cell adjusted +#' for BCV.} +#' \item{TrueCounts}{The simulated counts before dropout.} +#' \item{Dropout}{Logical matrix showing which values have been dropped +#' in which cells.} +#' } +#' } +#' } +#' +#' Values that have been added by Splatter are named using \code{CamelCase} in +#' order to differentiate them from the values added by Scater which uses +#' \code{underscore_naming}. +#' +#' @return SCESet object containing the simulated counts and intermediate +#' values. +#' +#' @seealso +#' \code{\link{splatSimLibSizes}}, \code{\link{splatSimGeneMeans}}, +#' \code{\link{splatSimDE}}, \code{\link{splatSimCellMeans}}, +#' \code{\link{splatSimBCVMeans}}, \code{\link{splatSimTrueCounts}}, +#' \code{\link{splatSimDropout}} +#' +#' @examples +#' # Simulation with default parameters +#' sim <- splatSimulate() +#' # Simulation with different number of genes +#' sim <- splatSimulate(nGenes = 1000) +#' # Simulation with custom parameters +#' params <- newSplatParams(nGenes = 100, mean.rate = 0.5) +#' sim <- splatSimulate(params) +#' # Simulation with adjusted custom parameters +#' sim <- splatSimulate(params, mean.rate = 0.6, out.prob = 0.2) +#' # Simulate groups +#' sim <- splatSimulate(method = "groups") +#' # Simulate paths +#' sim <- splatSimulate(method = "paths") +#' @importFrom Biobase fData pData pData<- assayData +#' @importFrom scater newSCESet counts +#' @export +splatSimulate <- function(params = newSplatParams(), + method = c("single", "groups", "paths"), + verbose = TRUE, ...) { + + method <- match.arg(method) + + if (verbose) {message("Getting parameters...")} + params <- setParams(params, ...) + params <- expandParams(params) + validObject(params) + + # Set random seed + seed <- getParam(params, "seed") + set.seed(seed) + + # Get the parameters we are going to use + nCells <- getParam(params, "nCells") + nGenes <- getParam(params, "nGenes") + nGroups <- getParam(params, "nGroups") + group.cells <- getParam(params, "groupCells") + + if (nGroups == 1 && method == "groups") { + warning("nGroups is 1, switching to single mode") + method <- "single" + } + + if (verbose) {message("Creating simulation object...")} + # Set up name vectors + cell.names <- paste0("Cell", 1:nCells) + gene.names <- paste0("Gene", 1:nGenes) + if (method == "groups") { + group.names <- paste0("Group", 1:nGroups) + } else if (method == "paths") { + group.names <- paste0("Path", 1:nGroups) + } + + # Create SCESet with dummy counts to store simulation + dummy.counts <- matrix(1, ncol = nCells, nrow = nGenes) + rownames(dummy.counts) <- gene.names + colnames(dummy.counts) <- cell.names + phenos <- new("AnnotatedDataFrame", data = data.frame(Cell = cell.names)) + rownames(phenos) <- cell.names + features <- new("AnnotatedDataFrame", data = data.frame(Gene = gene.names)) + rownames(features) <- gene.names + sim <- newSCESet(countData = dummy.counts, phenoData = phenos, + featureData = features) + + # Make groups vector which is the index of param$groupCells repeated + # params$groupCells[index] times + if (method != "single") { + groups <- lapply(1:nGroups, function(i, g) {rep(i, g[i])}, + g = group.cells) + groups <- unlist(groups) + pData(sim)$Group <- group.names[groups] + } + + if (verbose) {message("Simulating library sizes...")} + sim <- splatSimLibSizes(sim, params) + if (verbose) {message("Simulating gene means...")} + sim <- splatSimGeneMeans(sim, params) + if (method == "single") { + sim <- splatSimSingleCellMeans(sim, params) + } else if (method == "groups") { + if (verbose) {message("Simulating group DE...")} + sim <- splatSimGroupDE(sim, params) + if (verbose) {message("Simulating cell means...")} + sim <- splatSimGroupCellMeans(sim, params) + } else { + if (verbose) {message("Simulating path endpoints...")} + sim <- splatSimPathDE(sim, params) + if (verbose) {message("Simulating path steps...")} + sim <- splatSimPathCellMeans(sim, params) + } + if (verbose) {message("Simulating BCV...")} + sim <- splatSimBCVMeans(sim, params) + if (verbose) {message("Simulating counts..")} + sim <- splatSimTrueCounts(sim, params) + if (verbose) {message("Simulating dropout...")} + sim <- splatSimDropout(sim, params) + + if (verbose) {message("Creating final SCESet...")} + # Create new SCESet to make sure values are calculated correctly + sce <- newSCESet(countData = counts(sim), + phenoData = new("AnnotatedDataFrame", data = pData(sim)), + featureData = new("AnnotatedDataFrame", data = fData(sim))) + + # Add intermediate matrices stored in assayData + for (assay.name in names(assayData(sim))) { + if (!(assay.name %in% names(assayData(sce)))) { + assayData(sce)[[assay.name]] <- assayData(sim)[[assay.name]] + } + } + + if (verbose) {message("Done!")} + return(sce) +} + +#' @rdname splatSimulate +#' @export +splatSimulateSingle <- function(params = newSplatParams(), + verbose = TRUE, ...) { + sim <- splatSimulate(params = params, method = "single", + verbose = verbose, ...) + return(sim) +} + +#' @rdname splatSimulate +#' @export +splatSimulateGroups <- function(params = newSplatParams(), + verbose = TRUE, ...) { + sim <- splatSimulate(params = params, method = "groups", + verbose = verbose, ...) + return(sim) +} + +#' @rdname splatSimulate +#' @export +splatSimulatePaths <- function(params = newSplatParams(), verbose = TRUE, ...) { + sim <- splatSimulate(params = params, method = "paths", + verbose = verbose, ...) + return(sim) +} + +#' Simulate library sizes +#' +#' Simulate expected library sizes from a log-normal distribution +#' +#' @param sim SCESet to add library size to. +#' @param params SplatParams object with simulation parameters. +#' +#' @return SCESet with simulated library sizes. +#' +#' @importFrom Biobase pData pData<- +#' @importFrom stats rlnorm +splatSimLibSizes <- function(sim, params) { + + nCells <- getParam(params, "nCells") + lib.loc <- getParam(params, "lib.loc") + lib.scale <- getParam(params, "lib.scale") + + exp.lib.sizes <- rlnorm(nCells, lib.loc, lib.scale) + pData(sim)$ExpLibSize <- exp.lib.sizes + + return(sim) +} + +#' Simulate gene means +#' +#' Simulate gene means from a gamma distribution. Also simulates outlier +#' expression factors. Genes with an outlier factor not equal to 1 are replaced +#' with the median mean expression multiplied by the outlier factor. +#' +#' @param sim SCESet to add gene means to. +#' @param params SplatParams object with simulation parameters. +#' +#' @return SCESet with simulated gene means. +#' +#' @importFrom Biobase fData fData<- +#' @importFrom stats rgamma median +splatSimGeneMeans <- function(sim, params) { + + nGenes <- getParam(params, "nGenes") + mean.shape <- getParam(params, "mean.shape") + mean.rate <- getParam(params, "mean.rate") + out.prob <- getParam(params, "out.prob") + out.loProb <- getParam(params, "out.loProb") + out.facLoc <- getParam(params, "out.facLoc") + out.facScale <- getParam(params, "out.facScale") + + # Simulate base gene means + base.means.gene <- rgamma(nGenes, shape = mean.shape, rate = mean.rate) + + # Add expression outliers + outlier.facs <- getLNormFactors(nGenes, out.prob, out.loProb, out.facLoc, + out.facScale) + median.means.gene <- median(base.means.gene) + outlier.means <- median.means.gene * outlier.facs + is.outlier <- outlier.facs != 1 + means.gene <- base.means.gene + means.gene[is.outlier] <- outlier.means[is.outlier] + + fData(sim)$BaseGeneMean <- base.means.gene + fData(sim)$OutlierFactor <- outlier.facs + fData(sim)$GeneMean <- means.gene + + return(sim) +} + +#' Simulate group differential expression +#' +#' Simulate differential expression. Differential expression factors for each +#' group are produced using \code{\link{getLNormFactors}} and these are added +#' along with updated means for each group. For paths care is taked to make sure +#' they are simualated in the correct order. +#' +#' @param sim SCESet to add differential expression to. +#' @param params splatParams object with simulation parameters. +#' +#' @return SCESet with simulated differential expression. +#' +#' @name splatSimDE +NULL + +#' @rdname splatSimDE +#' @importFrom Biobase fData +splatSimGroupDE <- function(sim, params) { + + nGenes <- getParam(params, "nGenes") + nGroups <- getParam(params, "nGroups") + de.prob <- getParam(params, "de.prob") + de.downProb <- getParam(params, "de.downProb") + de.facLoc <- getParam(params, "de.facLoc") + de.facScale <- getParam(params, "de.facScale") + means.gene <- fData(sim)$GeneMean + + for (idx in 1:nGroups) { + de.facs <- getLNormFactors(nGenes, de.prob[idx], de.downProb[idx], + de.facLoc[idx], de.facScale[idx]) + group.means.gene <- means.gene * de.facs + fData(sim)[[paste0("DEFacGroup", idx)]] <- de.facs + fData(sim)[[paste0("GeneMeanGroup", idx)]] <- group.means.gene + } + + return(sim) +} + +#' @rdname splatSimDE +#' @importFrom Biobase fData +splatSimPathDE <- function(sim, params) { + + nGenes <- getParam(params, "nGenes") + de.prob <- getParam(params, "de.prob") + de.downProb <- getParam(params, "de.downProb") + de.facLoc <- getParam(params, "de.facLoc") + de.facScale <- getParam(params, "de.facScale") + path.from <- getParam(params, "path.from") + + path.order <- getPathOrder(path.from) + for (path in path.order) { + from <- path.from[path] + if (from == 0) { + means.gene <- fData(sim)$GeneMean + } else { + means.gene <- fData(sim)[[paste0("GeneMeanPath", from)]] + } + de.facs <- getLNormFactors(nGenes, de.prob[path], de.downProb[path], + de.facLoc[path], de.facScale[path]) + path.means.gene <- means.gene * de.facs + fData(sim)[[paste0("DEFacPath", path)]] <- de.facs + fData(sim)[[paste0("GeneMeanPath", path)]] <- path.means.gene + } + + return(sim) +} + +#' Simulate cell means +#' +#' Simulate a gene by cell matrix giving the mean expression for each gene in +#' each cell. Cells start with the mean expression for the group they belong to +#' (when simulating groups) or cells are assigned the mean expression from a +#' random position on the appropriate path (when simulating paths). The selected +#' means are adjusted for each cell's expected library size. +#' +#' @param sim SCESet to add cell means to. +#' @param params SplatParams object with simulation parameters. +#' +#' @return SCESet with added cell means. +#' +#' @name splatSimCellMeans +NULL + +#' @rdname splatSimCellMeans +#' @importFrom Biobase fData pData assayData assayData<- +splatSimSingleCellMeans <- function(sim, params) { + + nCells <- getParam(params, "nCells") + cell.names <- pData(sim)$Cell + gene.names <- fData(sim)$Gene + exp.lib.sizes <- pData(sim)$ExpLibSize + + cell.means.gene <- as.matrix(fData(sim)[, rep("GeneMean", nCells)]) + cell.props.gene <- t(t(cell.means.gene) / colSums(cell.means.gene)) + base.means.cell <- t(t(cell.props.gene) * exp.lib.sizes) + colnames(base.means.cell) <- cell.names + rownames(base.means.cell) <- gene.names + + assayData(sim)$BaseCellMeans <- base.means.cell + + return(sim) +} + +#' @rdname splatSimCellMeans +#' @importFrom Biobase fData pData assayData assayData<- +splatSimGroupCellMeans <- function(sim, params) { + + nGroups <- getParam(params, "nGroups") + cell.names <- pData(sim)$Cell + gene.names <- fData(sim)$Gene + groups <- pData(sim)$Group + group.names <- unique(groups) + exp.lib.sizes <- pData(sim)$ExpLibSize + + group.means.gene <- fData(sim)[, paste0("GeneMean", group.names)] + cell.means.gene <- as.matrix(group.means.gene[, factor(groups)]) + cell.props.gene <- t(t(cell.means.gene) / colSums(cell.means.gene)) + base.means.cell <- t(t(cell.props.gene) * exp.lib.sizes) + colnames(base.means.cell) <- cell.names + rownames(base.means.cell) <- gene.names + + assayData(sim)$BaseCellMeans <- base.means.cell + + return(sim) +} + +#' @rdname splatSimCellMeans +#' @importFrom Biobase fData pData assayData +#' @importFrom stats rbinom +splatSimPathCellMeans <- function(sim, params) { + + nGenes <- getParam(params, "nGenes") + nGroups <- getParam(params, "nGroups") + group.cells <- getParam(params, "groupCells") + path.from <- getParam(params, "path.from") + path.length <- getParam(params, "path.length") + path.skew <- getParam(params, "path.skew") + path.nonlinearProb <- getParam(params, "path.nonlinearProb") + path.sigmaFac <- getParam(params, "path.sigmaFac") + cell.names <- pData(sim)$Cell + gene.names <- fData(sim)$Gene + groups <- pData(sim)$Group + group.names <- unique(groups) + exp.lib.sizes <- pData(sim)$ExpLibSize + + # Generate paths. Each path is a matrix with path.length columns and + # nGenes rows where the expression from each genes changes along the path. + path.steps <- lapply(seq_along(path.from), function(idx) { + from <- path.from[idx] + # Find the means at the starting position + if (from == 0) { + means.start <- fData(sim)$GeneMean + } else { + means.start <- fData(sim)[[paste0("GeneMeanPath", from)]] + } + # Find the means at the end position + means.end <- fData(sim)[[paste0("GeneMeanPath", idx)]] + + # Select genes to follow a non-linear path + is.nonlinear <- as.logical(rbinom(nGenes, 1, path.nonlinearProb)) + sigma.facs <- rep(0, nGenes) + sigma.facs[is.nonlinear] <- path.sigmaFac + # Build Brownian bridges from start to end + steps <- buildBridges(means.start, means.end, n = path.length[idx], + sigma.fac = sigma.facs) + + fData(sim)[[paste0("SigmaFacPath", idx)]] <- sigma.facs + + return(t(steps)) + }) + + # Randomly assign a position in the appropriate path to each cell + cell.steps <- lapply(1:nGroups, function(idx) { + path.probs <- seq(path.skew[idx], 1 - path.skew[idx], + length = path.length[idx]) + path.probs <- path.probs / sum(path.probs) + steps <- sort(sample(1:path.length[idx], group.cells[idx], + prob = path.probs, replace = TRUE)) + + return(steps) + }) + + # Collect the underlying expression levels for each cell + cell.means.gene <- lapply(1:nGroups, function(idx) { + cell.means <- path.steps[[idx]][, cell.steps[[idx]]] + return(cell.means) + }) + cell.means.gene <- do.call(cbind, cell.means.gene) + + # Adjust expression based on library size + cell.props.gene <- t(t(cell.means.gene) / colSums(cell.means.gene)) + base.means.cell <- t(t(cell.props.gene) * exp.lib.sizes) + colnames(base.means.cell) <- cell.names + rownames(base.means.cell) <- gene.names + + pData(sim)$Step <- unlist(cell.steps) + assayData(sim)$BaseCellMeans <- base.means.cell + + return(sim) +} + +#' Simulate BCV means +#' +#' Simulate means for each gene in each cell that are adjusted to follow a +#' mean-variance trend using Biological Coefficient of Variation taken from +#' and inverse gamma distribution. +#' +#' @param sim SCESet to add BCV means to. +#' @param params SplatParams object with simulation parameters. +#' +#' @return SCESet with simulated BCV means. +#' +#' @importFrom Biobase fData pData assayData assayData<- +#' @importFrom stats rchisq rgamma +splatSimBCVMeans <- function(sim, params) { + + nGenes <- getParam(params, "nGenes") + nCells <- getParam(params, "nCells") + bcv.common <- getParam(params, "bcv.common") + bcv.df <- getParam(params, "bcv.df") + cell.names <- pData(sim)$Cell + gene.names <- fData(sim)$Gene + base.means.cell <- assayData(sim)$BaseCellMeans + + bcv <- (bcv.common + (1 / sqrt(base.means.cell))) * + sqrt(bcv.df / rchisq(nGenes, df = bcv.df)) + + means.cell <- matrix(rgamma(nGenes * nCells, shape = 1 / (bcv ^ 2), + scale = base.means.cell * (bcv ^ 2)), + nrow = nGenes, ncol = nCells) + + colnames(bcv) <- cell.names + rownames(bcv) <- gene.names + colnames(means.cell) <- cell.names + rownames(means.cell) <- gene.names + + assayData(sim)$BCV <- bcv + assayData(sim)$CellMeans <- means.cell + + return(sim) +} + +#' Simulate true counts +#' +#' Simulate a true counts matrix. Counts are simulated from a poisson +#' distribution where Each gene in each cell has it's own mean based on the +#' group (or path position), expected library size and BCV. +#' +#' @param sim SCESet to add true counts to. +#' @param params SplatParams object with simulation parameters. +#' +#' @return SCESet with simulated true counts. +#' +#' @importFrom Biobase fData pData assayData +#' @importFrom stats rpois +splatSimTrueCounts <- function(sim, params) { + + nGenes <- getParam(params, "nGenes") + nCells <- getParam(params, "nCells") + cell.names <- pData(sim)$Cell + gene.names <- fData(sim)$Gene + cell.means <- assayData(sim)$CellMeans + + true.counts <- matrix(rpois(nGenes * nCells, lambda = cell.means), + nrow = nGenes, ncol = nCells) + + colnames(true.counts) <- cell.names + rownames(true.counts) <- gene.names + + assayData(sim)$TrueCounts <- true.counts + + return(sim) +} + +#' Simulate dropout +#' +#' A logistic function is used to form a relationshop between the expression +#' level of a gene and the probability of dropout, giving a probability for each +#' gene in each cell. These probabilities are used in a Bernoulli distribution +#' to decide which counts should be dropped. +#' +#' @param sim SCESet to add dropout to. +#' @param params SplatParams object with simulation parameters. +#' +#' @return SCESet with simulated dropout and observed counts. +#' +#' @importFrom Biobase fData pData assayData assayData<- +#' @importFrom stats rbinom +splatSimDropout <- function(sim, params) { + + dropout.present <- getParam(params, "dropout.present") + true.counts <- assayData(sim)$TrueCounts + + if (dropout.present) { + nCells <- getParam(params, "nCells") + nGenes <- getParam(params, "nGenes") + dropout.mid <- getParam(params, "dropout.mid") + dropout.shape <- getParam(params, "dropout.shape") + cell.names <- pData(sim)$Cell + gene.names <- fData(sim)$Gene + cell.means <- assayData(sim)$CellMeans + + # Generate probabilites based on expression + lib.sizes <- colSums(true.counts) + cell.facs <- log(lib.sizes) / median(lib.sizes) + drop.prob <- sapply(1:nCells, function(idx) { + eta <- cell.facs[idx] * (log(cell.means[, idx])) + return(logistic(eta, x0 = dropout.mid, k = dropout.shape)) + }) + + # Decide which counts to keep + keep <- matrix(rbinom(nCells * nGenes, 1, 1 - drop.prob), + nrow = nGenes, ncol = nCells) + + counts <- true.counts * keep + + colnames(drop.prob) <- cell.names + rownames(drop.prob) <- gene.names + colnames(keep) <- cell.names + rownames(keep) <- gene.names + + assayData(sim)$DropProb <- drop.prob + assayData(sim)$Dropout <- !keep + + } else { + counts <- true.counts + } + + scater::counts(sim) <- counts + + return(sim) +} + +#' Get log-normal factors +#' +#' Randomly generate multiplication factors from a log-normal distribution. +#' +#' @param n.facs Number of factors to generate. +#' @param sel.prob Probability that a factor will be selected to be different +#' from 1. +#' @param neg.prob Probability that a selected factor is less than one. +#' @param fac.loc Location parameter for the log-normal distribution. +#' @param fac.scale Scale factor for the log-normal distribution. +#' +#' @return Vector containing generated factors. +#' +#' @importFrom stats rbinom rlnorm +getLNormFactors <- function(n.facs, sel.prob, neg.prob, fac.loc, fac.scale) { + + is.selected <- as.logical(rbinom(n.facs, 1, sel.prob)) + n.selected <- sum(is.selected) + dir.selected <- (-1) ^ rbinom(n.selected, 1, neg.prob) + facs.selected <- rlnorm(n.selected, fac.loc, fac.scale) + # Reverse directions for factors that are less than one + dir.selected[facs.selected < 1 & dir.selected == -1] <- 1 + dir.selected[facs.selected < 1 & dir.selected == 1] <- -1 + factors <- rep(1, n.facs) + factors[is.selected] <- facs.selected ^ dir.selected + + return(factors) +} + +#' Get path order +#' +#' Identify the correct order to process paths so that preceding paths have +#' already been simulated. +#' +#' @param path.from vector giving the path endpoints that each path orginates +#' from. +#' +#' @return Vector giving the order to process paths in. +getPathOrder <- function(path.from) { + + # Transform the vector into a list of (from, to) pairs + path.pairs <- list() + for (idx in 1:length(path.from)) { + path.pairs[[idx]] <- c(path.from[idx], idx) + } + + # Determine the processing order + # If a path is in the "done" vector any path originating here can be + # completed + done <- 0 + while (length(path.pairs) > 0) { + path.pair <- path.pairs[[1]] + path.pairs <- path.pairs[-1] + from <- path.pair[1] + to <- path.pair[2] + if (from %in% done) { + done <- c(done, to) + } else { + path.pairs <- c(path.pairs, list(path.pair)) + } + } + + # Remove the origin from the vector + done <- done[-1] + + return(done) +} + +#' Brownian bridge +#' +#' Calculate a smoothed Brownian bridge between two points. A Brownian bridge is +#' a random walk with fixed end points. +#' +#' @param x starting value. +#' @param y end value. +#' @param N number of steps in random walk. +#' @param n number of points in smoothed bridge. +#' @param sigma.fac multiplier specifying how extreme each step can be. +#' +#' @return Vector of length n following a path from x to y. +#' +#' @importFrom stats runif rnorm +bridge <- function (x = 0, y = 0, N = 5, n = 100, sigma.fac = 0.8) { + + dt <- 1 / (N - 1) + t <- seq(0, 1, length = N) + sigma2 <- runif(1, 0, sigma.fac * mean(c(x, y))) + X <- c(0, cumsum(rnorm(N - 1, sd = sigma2) * sqrt(dt))) + BB <- x + X - t * (X[N] - y + x) + BB <- akima::aspline(BB, n = n)$y + BB[BB < 0] <- 0 + + return(BB) +} +buildBridges <- Vectorize(bridge, vectorize.args = c("x", "y", "sigma.fac")) \ No newline at end of file diff --git a/man/bridge.Rd b/man/bridge.Rd new file mode 100644 index 0000000..b269c4a --- /dev/null +++ b/man/bridge.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splat-simulate.R +\name{bridge} +\alias{bridge} +\title{Brownian bridge} +\usage{ +bridge(x = 0, y = 0, N = 5, n = 100, sigma.fac = 0.8) +} +\arguments{ +\item{x}{starting value.} + +\item{y}{end value.} + +\item{N}{number of steps in random walk.} + +\item{n}{number of points in smoothed bridge.} + +\item{sigma.fac}{multiplier specifying how extreme each step can be.} +} +\value{ +Vector of length n following a path from x to y. +} +\description{ +Calculate a smoothed Brownian bridge between two points. A Brownian bridge is +a random walk with fixed end points. +} + diff --git a/man/estSplatMeans.Rd b/man/estSplatMean.Rd similarity index 87% rename from man/estSplatMeans.Rd rename to man/estSplatMean.Rd index 6b03250..37e26e0 100644 --- a/man/estSplatMeans.Rd +++ b/man/estSplatMean.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/splat-estimate.R -\name{estSplatMeans} -\alias{estSplatMeans} +\name{estSplatMean} +\alias{estSplatMean} \title{Estimate Splatter mean parameters} \usage{ -estSplatMeans(norm.counts, params) +estSplatMean(norm.counts, params) } \arguments{ \item{norm.counts}{library size normalised counts matrix.} diff --git a/man/estimateSimpleParams.Rd b/man/estimateSimpleParams.Rd index b20c902..7b1c33f 100644 --- a/man/estimateSimpleParams.Rd +++ b/man/estimateSimpleParams.Rd @@ -6,15 +6,15 @@ \alias{estimateSimpleParams.matrix} \title{Estimate simple simulation parameters} \usage{ -estimateSimpleParams(data, params = newSimpleParams()) +estimateSimpleParams(counts, params = newSimpleParams()) -\method{estimateSimpleParams}{SCESet}(data, params = newSimpleParams()) +\method{estimateSimpleParams}{SCESet}(counts, params = newSimpleParams()) -\method{estimateSimpleParams}{matrix}(data, params = newSimpleParams()) +\method{estimateSimpleParams}{matrix}(counts, params = newSimpleParams()) } \arguments{ -\item{data}{either a counts matrix or an SCESet object containing count data -to estimate parameters from.} +\item{counts}{either a counts matrix or an SCESet object containing count +data to estimate parameters from.} \item{params}{SimpleParams object to store estimated values in.} } diff --git a/man/estimateSplatParams.Rd b/man/estimateSplatParams.Rd index 5d8b042..f93bae6 100644 --- a/man/estimateSplatParams.Rd +++ b/man/estimateSplatParams.Rd @@ -6,15 +6,15 @@ \alias{estimateSplatParams.matrix} \title{Estimate Splatter simulation parameters} \usage{ -estimateSplatParams(data, params = newSplatParams()) +estimateSplatParams(counts, params = newSplatParams()) -\method{estimateSplatParams}{SCESet}(data, params = newSplatParams()) +\method{estimateSplatParams}{SCESet}(counts, params = newSplatParams()) -\method{estimateSplatParams}{matrix}(data, params = newSplatParams()) +\method{estimateSplatParams}{matrix}(counts, params = newSplatParams()) } \arguments{ -\item{data}{either a counts matrix or an SCESet object containing count data -to estimate parameters from.} +\item{counts}{either a counts matrix or an SCESet object containing count +data to estimate parameters from.} \item{params}{SplatParams object to store estimated values in.} } @@ -23,14 +23,17 @@ SplatParams object containing the estimated parameters. } \description{ Estimate simulation parameters for the Splatter simulation from a real -dataset. -} -\details{ - +dataset. See the individual estimation functions for more details on how this +is done. } \examples{ data("sc_example_counts") params <- estimateSplatParams(sc_example_counts) params } +\seealso{ +\code{\link{estSplatMean}}, \code{\link{estSplatLib}}, +\code{\link{estSplatOutlier}}, \code{\link{estSplatBCV}}, +\code{\link{estSplatDropout}} +} diff --git a/man/getLNormFactors.Rd b/man/getLNormFactors.Rd new file mode 100644 index 0000000..5da9ff6 --- /dev/null +++ b/man/getLNormFactors.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splat-simulate.R +\name{getLNormFactors} +\alias{getLNormFactors} +\title{Get log-normal factors} +\usage{ +getLNormFactors(n.facs, sel.prob, neg.prob, fac.loc, fac.scale) +} +\arguments{ +\item{n.facs}{Number of factors to generate.} + +\item{sel.prob}{Probability that a factor will be selected to be different +from 1.} + +\item{neg.prob}{Probability that a selected factor is less than one.} + +\item{fac.loc}{Location parameter for the log-normal distribution.} + +\item{fac.scale}{Scale factor for the log-normal distribution.} +} +\value{ +Vector containing generated factors. +} +\description{ +Randomly generate multiplication factors from a log-normal distribution. +} + diff --git a/man/getPathOrder.Rd b/man/getPathOrder.Rd new file mode 100644 index 0000000..d59ecfb --- /dev/null +++ b/man/getPathOrder.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splat-simulate.R +\name{getPathOrder} +\alias{getPathOrder} +\title{Get path order} +\usage{ +getPathOrder(path.from) +} +\arguments{ +\item{path.from}{vector giving the path endpoints that each path orginates +from.} +} +\value{ +Vector giving the order to process paths in. +} +\description{ +Identify the correct order to process paths so that preceding paths have +already been simulated. +} + diff --git a/man/splatSimBCVMeans.Rd b/man/splatSimBCVMeans.Rd new file mode 100644 index 0000000..08c7c28 --- /dev/null +++ b/man/splatSimBCVMeans.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splat-simulate.R +\name{splatSimBCVMeans} +\alias{splatSimBCVMeans} +\title{Simulate BCV means} +\usage{ +splatSimBCVMeans(sim, params) +} +\arguments{ +\item{sim}{SCESet to add BCV means to.} + +\item{params}{SplatParams object with simulation parameters.} +} +\value{ +SCESet with simulated BCV means. +} +\description{ +Simulate means for each gene in each cell that are adjusted to follow a +mean-variance trend using Biological Coefficient of Variation taken from +and inverse gamma distribution. +} + diff --git a/man/splatSimCellMeans.Rd b/man/splatSimCellMeans.Rd new file mode 100644 index 0000000..698aa62 --- /dev/null +++ b/man/splatSimCellMeans.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splat-simulate.R +\name{splatSimCellMeans} +\alias{splatSimCellMeans} +\alias{splatSimGroupCellMeans} +\alias{splatSimPathCellMeans} +\alias{splatSimSingleCellMeans} +\title{Simulate cell means} +\usage{ +splatSimSingleCellMeans(sim, params) + +splatSimGroupCellMeans(sim, params) + +splatSimPathCellMeans(sim, params) +} +\arguments{ +\item{sim}{SCESet to add cell means to.} + +\item{params}{SplatParams object with simulation parameters.} +} +\value{ +SCESet with added cell means. +} +\description{ +Simulate a gene by cell matrix giving the mean expression for each gene in +each cell. Cells start with the mean expression for the group they belong to +(when simulating groups) or cells are assigned the mean expression from a +random position on the appropriate path (when simulating paths). The selected +means are adjusted for each cell's expected library size. +} + diff --git a/man/splatSimDE.Rd b/man/splatSimDE.Rd new file mode 100644 index 0000000..b1729d9 --- /dev/null +++ b/man/splatSimDE.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splat-simulate.R +\name{splatSimDE} +\alias{splatSimDE} +\alias{splatSimGroupDE} +\alias{splatSimPathDE} +\title{Simulate group differential expression} +\usage{ +splatSimGroupDE(sim, params) + +splatSimPathDE(sim, params) +} +\arguments{ +\item{sim}{SCESet to add differential expression to.} + +\item{params}{splatParams object with simulation parameters.} +} +\value{ +SCESet with simulated differential expression. +} +\description{ +Simulate differential expression. Differential expression factors for each +group are produced using \code{\link{getLNormFactors}} and these are added +along with updated means for each group. For paths care is taked to make sure +they are simualated in the correct order. +} + diff --git a/man/splatSimDropout.Rd b/man/splatSimDropout.Rd new file mode 100644 index 0000000..2fe71cb --- /dev/null +++ b/man/splatSimDropout.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splat-simulate.R +\name{splatSimDropout} +\alias{splatSimDropout} +\title{Simulate dropout} +\usage{ +splatSimDropout(sim, params) +} +\arguments{ +\item{sim}{SCESet to add dropout to.} + +\item{params}{SplatParams object with simulation parameters.} +} +\value{ +SCESet with simulated dropout and observed counts. +} +\description{ +A logistic function is used to form a relationshop between the expression +level of a gene and the probability of dropout, giving a probability for each +gene in each cell. These probabilities are used in a Bernoulli distribution +to decide which counts should be dropped. +} + diff --git a/man/splatSimGeneMeans.Rd b/man/splatSimGeneMeans.Rd new file mode 100644 index 0000000..531f2b7 --- /dev/null +++ b/man/splatSimGeneMeans.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splat-simulate.R +\name{splatSimGeneMeans} +\alias{splatSimGeneMeans} +\title{Simulate gene means} +\usage{ +splatSimGeneMeans(sim, params) +} +\arguments{ +\item{sim}{SCESet to add gene means to.} + +\item{params}{SplatParams object with simulation parameters.} +} +\value{ +SCESet with simulated gene means. +} +\description{ +Simulate gene means from a gamma distribution. Also simulates outlier +expression factors. Genes with an outlier factor not equal to 1 are replaced +with the median mean expression multiplied by the outlier factor. +} + diff --git a/man/splatSimLibSizes.Rd b/man/splatSimLibSizes.Rd new file mode 100644 index 0000000..4903d21 --- /dev/null +++ b/man/splatSimLibSizes.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splat-simulate.R +\name{splatSimLibSizes} +\alias{splatSimLibSizes} +\title{Simulate library sizes} +\usage{ +splatSimLibSizes(sim, params) +} +\arguments{ +\item{sim}{SCESet to add library size to.} + +\item{params}{SplatParams object with simulation parameters.} +} +\value{ +SCESet with simulated library sizes. +} +\description{ +Simulate expected library sizes from a log-normal distribution +} + diff --git a/man/splatSimTrueCounts.Rd b/man/splatSimTrueCounts.Rd new file mode 100644 index 0000000..7ba61a7 --- /dev/null +++ b/man/splatSimTrueCounts.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splat-simulate.R +\name{splatSimTrueCounts} +\alias{splatSimTrueCounts} +\title{Simulate true counts} +\usage{ +splatSimTrueCounts(sim, params) +} +\arguments{ +\item{sim}{SCESet to add true counts to.} + +\item{params}{SplatParams object with simulation parameters.} +} +\value{ +SCESet with simulated true counts. +} +\description{ +Simulate a true counts matrix. Counts are simulated from a poisson +distribution where Each gene in each cell has it's own mean based on the +group (or path position), expected library size and BCV. +} + diff --git a/man/splatSimulate.Rd b/man/splatSimulate.Rd new file mode 100644 index 0000000..abb44a6 --- /dev/null +++ b/man/splatSimulate.Rd @@ -0,0 +1,131 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splat-simulate.R +\name{splatSimulate} +\alias{splatSimulate} +\alias{splatSimulateGroups} +\alias{splatSimulatePaths} +\alias{splatSimulateSingle} +\title{Splatter simulation} +\usage{ +splatSimulate(params = newSplatParams(), method = c("single", "groups", + "paths"), verbose = TRUE, ...) + +splatSimulateSingle(params = newSplatParams(), verbose = TRUE, ...) + +splatSimulateGroups(params = newSplatParams(), verbose = TRUE, ...) + +splatSimulatePaths(params = newSplatParams(), verbose = TRUE, ...) +} +\arguments{ +\item{params}{SplatParams object containing parameters for the simulation. +See \code{\link{SplatParams}} for details.} + +\item{method}{which simulation method to use. Options are "single" which +produces a single population, "groups" which produces distinct groups +(eg. cell types) or "paths" which selects cells from continuous +trajectories (eg. differentiation processes).} + +\item{verbose}{logical. Whether to print progress messages.} + +\item{...}{any additional parameter settings to override what is provided in +\code{params}.} +} +\value{ +SCESet object containing the simulated counts and intermediate +values. +} +\description{ +Simulate count data from a fictional single-cell RNA-seq experiment using +the Splatter method. +} +\details{ +Parameters can be set in a variety of ways. If no parameters are provided +the default parameters are used. Any parameters in \code{params} can be +overridden by supplying additional arguments through a call to +\code{\link{setParams}}. This design allows the user flexibility in +how they supply parameters and allows small adjustments without creating a +new \code{SplatParams} object. See examples for a demonstration of how this +can be used. + +The simulation involves the following steps: +\enumerate{ + \item Set up simulation object + \item Simulate library sizes + \item Simulate gene means + \item Simulate groups/paths + \item Simulate BCV adjusted cell means + \item Simulate true counts + \item Simulate dropout + \item Create final SCESet object +} + +The final output is an \code{\link[scater]{SCESet}} object that contains the +simulated counts but also the values for various intermediate steps. These +are stored in the \code{\link[Biobase]{phenoData}} (for cell specific +information), \code{\link[Biobase]{featureData}} (for gene specific +information) or \code{\link[Biobase]{assayData}} (for gene by cell matrices) +slots. This additional information includes: +\describe{ + \item{\code{phenoData}}{ + \describe{ + \item{Cell}{Unique cell identifier.} + \item{Group}{The group or path the cell belongs to.} + \item{ExpLibSize}{The expected library size for that cell.} + \item{Step (paths only)}{how far along the path each cell is.} + } + } + \item{\code{featureData}}{ + \describe{ + \item{Gene}{Unique gene identifier.} + \item{BaseGeneMean}{The base expression level for that gene.} + \item{OutlierFactor}{Expression outlier factor for that gene. Values + of 1 indicate the gene is not an expression outlier.} + \item{GeneMean}{Expression level after applying outlier factors.} + \item{DEFac[Group]}{The differential expression factor for each gene + in a particular group. Values of 1 indicate the gene is not + differentially expressed.} + \item{GeneMean[Group]}{Expression level of a gene in a particular + group after applying differential expression factors.} + } + } + \item{\code{assayData}}{ + \describe{ + \item{BaseCellMeans}{The expression of genes in each cell adjusted for + expected library size.} + \item{BCV}{The Biological Coefficient of Variation for each gene in + each cell.} + \item{CellMeans}{The expression level of genes in each cell adjusted + for BCV.} + \item{TrueCounts}{The simulated counts before dropout.} + \item{Dropout}{Logical matrix showing which values have been dropped + in which cells.} + } + } +} + +Values that have been added by Splatter are named using \code{CamelCase} in +order to differentiate them from the values added by Scater which uses +\code{underscore_naming}. +} +\examples{ +# Simulation with default parameters +sim <- splatSimulate() +# Simulation with different number of genes +sim <- splatSimulate(nGenes = 1000) +# Simulation with custom parameters +params <- newSplatParams(nGenes = 100, mean.rate = 0.5) +sim <- splatSimulate(params) +# Simulation with adjusted custom parameters +sim <- splatSimulate(params, mean.rate = 0.6, out.prob = 0.2) +# Simulate groups +sim <- splatSimulate(method = "groups") +# Simulate paths +sim <- splatSimulate(method = "paths") +} +\seealso{ +\code{\link{splatSimLibSizes}}, \code{\link{splatSimGeneMeans}}, +\code{\link{splatSimDE}}, \code{\link{splatSimCellMeans}}, +\code{\link{splatSimBCVMeans}}, \code{\link{splatSimTrueCounts}}, +\code{\link{splatSimDropout}} +} + -- GitLab