Skip to content
Snippets Groups Projects
lun2-estimate.R 7.74 KiB
Newer Older
  • Learn to ignore specific revisions
  • Luke Zappia's avatar
    Luke Zappia committed
    #' Estimate Lun2 simulation parameters
    #'
    #' Estimate simulation parameters for the Lun2 simulation from a real dataset.
    #'
    #' @param counts either a counts matrix or an SCESet object containing count
    #'        data to estimate parameters from.
    #' @param plates integer vector giving the plate that each cell originated from.
    #' @param params Lun2Params object to store estimated values in.
    #' @param min.size minimum size of clusters when identifying group of cells in
    #'        the data.
    #' @param verbose logical. Whether to show progress messages.
    
    Luke Zappia's avatar
    Luke Zappia committed
    #' @param BPPARAM A \code{\link[BiocParallel]{BiocParallelParam}} instance
    
    Luke Zappia's avatar
    Luke Zappia committed
    #'        giving the parallel back-end to be used. Default is
    #'        \code{\link[BiocParallel]{SerialParam}} which uses a single core.
    
    Luke Zappia's avatar
    Luke Zappia committed
    #'
    #' @details
    #' See \code{\link{Lun2Params}} for more details on the parameters.
    #'
    #' @return LunParams object containing the estimated parameters.
    #'
    #' @examples
    #' \dontrun{
    #' data("sc_example_counts")
    #' data("sc_example_cell_info")
    #' plates <- factor(sc_example_cell_info$Mutation_Status)
    #' params <- lun2Estimate(sc_example_counts, plates, min.size = 20)
    #' params
    #' }
    
    Luke Zappia's avatar
    Luke Zappia committed
    #' @importFrom BiocParallel bplapply SerialParam
    
    Luke Zappia's avatar
    Luke Zappia committed
    #' @export
    lun2Estimate <- function(counts, plates, params = newLun2Params(),
    
    Luke Zappia's avatar
    Luke Zappia committed
                             min.size = 200, verbose = TRUE,
                             BPPARAM = SerialParam()) {
    
    Luke Zappia's avatar
    Luke Zappia committed
        UseMethod("lun2Estimate")
    }
    
    #' @rdname lun2Estimate
    #' @export
    lun2Estimate.SCESet <- function(counts, plates, params = newLun2Params(),
    
    Luke Zappia's avatar
    Luke Zappia committed
                                    min.size = 200, verbose = TRUE,
                                    BPPARAM = SerialParam()) {
    
    Luke Zappia's avatar
    Luke Zappia committed
        counts <- scater::counts(counts)
    
        lun2Estimate(counts, plates, params, min.size = min.size, verbose = verbose)
    
    Luke Zappia's avatar
    Luke Zappia committed
    }
    
    #' @rdname lun2Estimate
    #' @importFrom stats model.matrix
    
    #' @importFrom locfit locfit
    
    Luke Zappia's avatar
    Luke Zappia committed
    #' @export
    lun2Estimate.matrix <- function(counts, plates, params = newLun2Params(),
    
    Luke Zappia's avatar
    Luke Zappia committed
                                    min.size = 200, verbose = TRUE,
                                    BPPARAM = SerialParam()) {
    
    Luke Zappia's avatar
    Luke Zappia committed
    
        # Check suggested packages
        if (!requireNamespace("scran", quietly = TRUE)) {
            stop("The Lun2 simulation requires the 'scran' package for estimation.")
        }
    
        if (!requireNamespace("lme4", quietly = TRUE)) {
            stop("The Lun2 simulation requires the 'lme4' package for estimation.")
        }
    
        if (!requireNamespace("pscl", quietly = TRUE)) {
            stop("The Lun2 simulation requires the 'pscl' package for estimation.")
        }
    
        progress <- FALSE
        if (requireNamespace("progress", quietly = TRUE)) {
            progress <- TRUE
        }
        progress <- progress && verbose
    
        checkmate::assertClass(params, "Lun2Params")
        checkmate::assertInt(min.size, lower = 20, upper = length(plates))
        checkmate::assertIntegerish(plates, len = ncol(counts))
    
        if (length(unique(plates)) < 2) {
            stop("Plates must contain at least 2 values.")
        }
    
        dge <- edgeR::DGEList(counts)
        dge <- edgeR::`[.DGEList`(dge, rowMeans(dge$counts) >= 1, )
    
        # Estimate how many groups there are in the data
        if (verbose) {message("Estimating number of groups...")}
        groups <- scran::quickCluster(dge$counts, min.size)
        # Calculate normalisation factors
        if (verbose) {message("Computing normalisation factors...")}
        # Get the sizes for normalisation based on the number of cells in each
        # cluster
        min.cluster.size <- min(table(groups))
        if (min.cluster.size < 20) {
            sizes <- 10
        } else if (min.cluster.size < 100) {
            sizes <- seq(20, min.size, 20)
        } else {
            sizes <- seq(20, 100, 20)
        }
        sum.facs <- scran::computeSumFactors(dge$counts, cluster = groups,
                                             sizes = sizes)
        dge$samples$norm.factors <- sum.facs / dge$samples$lib.size
        # Mean centre normalisation factors
        dge$samples$norm.factors <- dge$samples$norm.factors /
            exp(mean(log(dge$samples$norm.factors)))
    
        # Estimating the NB dispersion (assuming sufficient residual d.f. to
        # estimate the dispersion without EB shrinkage).
        if (verbose) {message("Estimating dispersions...")}
        plateX <- model.matrix(~plates)
        dge <- edgeR::estimateDisp(dge, plateX, prior.df = 0, trend = "none")
    
        # Estimating the log-overall mean
        if (verbose) {message("Estimating gene means...")}
        centered.off <- edgeR::getOffset(dge)
        centered.off <- centered.off - mean(centered.off)
        logmeans <- edgeR::mglmOneGroup(dge$counts, offset = centered.off,
                                        dispersion = dge$tagwise.dispersion)
    
        # Estimating the plate effect variance
        if (verbose) {message("Estimating plate effects...")}
        if (progress) {
            pb.format <- "[:bar] :percent eta: :eta"
            pb <- progress::progress_bar$new(format = pb.format,
                                             total = nrow(dge), clear = FALSE)
            pb$tick(0)
        } else if (verbose) {
            message("This may take some time. Install 'progress' to see a ",
                    "progress bar.")
        }
        # As well as errors glmer produces warnings. Stop these showing because we
        # expect them.
        suppressWarnings(
    
    Luke Zappia's avatar
    Luke Zappia committed
        collected <- bplapply(seq_len(nrow(dge)), function(i) {
    
    Luke Zappia's avatar
    Luke Zappia committed
            if (progress) {pb$tick()}
            tryCatch({
    
                out <- lme4::glmer(
                           Counts ~ 0 + (1 | Plate) + offset(log(sum.facs)),
                           data = data.frame(Counts = as.integer(counts[i, ]),
                                             Group = groups,
                                             Plate = plates),
                           family = lme4::negative.binomial(1 / dge$tagwise[i]))
    
    Luke Zappia's avatar
    Luke Zappia committed
                output <- unlist(lme4::VarCorr(out))
                return(output)
            }, error = function(err) {
                output <- NA_real_
                return(output)
            })
    
    Luke Zappia's avatar
    Luke Zappia committed
        }, BPPARAM = BPPARAM))
    
    Luke Zappia's avatar
    Luke Zappia committed
        sigma2 <- mean(unlist(collected), na.rm = TRUE)
    
        # Repeating the estimation of the dispersion with ZINB models.
        if (verbose) {message("Estimating zero-inflated parameters...")}
        zinb.prop <- rep(-Inf, nrow(dge))
        zinb.disp <- dge$tagwise.dispersion
        zinb.mean <- exp(logmeans)
        nonzeros <- which(rowSums(dge$counts == 0) > 0)
        if (progress) {
            pb <- progress::progress_bar$new(format = "[:bar] :percent eta: :eta",
                                             total = length(nonzeros),
                                             clear = FALSE)
            pb$tick(0)
        } else if (verbose) {
            message("This may take some time. Install 'progress' to see a ",
                    "progress bar.")
        }
    
    Luke Zappia's avatar
    Luke Zappia committed
        zinb.ests <- bplapply(nonzeros, function(i) {
    
    Luke Zappia's avatar
    Luke Zappia committed
            if (progress) {pb$tick()}
    
    Luke Zappia's avatar
    Luke Zappia committed
            zinb.est <- c(mean = zinb.mean[i], prop = zinb.prop[i],
                          disp = zinb.disp[i])
    
    Luke Zappia's avatar
    Luke Zappia committed
            tryCatch({
                zfit <- pscl::zeroinfl(dge$count[i, ] ~ 0 + plates | 1,
                                       dist = "negbin", offset = log(sum.facs))
    
    Luke Zappia's avatar
    Luke Zappia committed
                zinb.est <- c(mean = mean(exp(zfit$coefficients$count)),
                              prop = unname(zfit$coefficients$zero),
                              disp = 1 / zfit$theta)
    
    Luke Zappia's avatar
    Luke Zappia committed
            }, error = function(err) {})
    
    Luke Zappia's avatar
    Luke Zappia committed
            return(zinb.est)
        }, BPPARAM = BPPARAM)
    
        zinb.ests <- do.call("rbind", zinb.ests)
    
        zinb.prop[nonzeros] <- zinb.ests[, "prop"]
        zinb.disp[nonzeros] <- zinb.ests[, "disp"]
        zinb.mean[nonzeros] <- zinb.ests[, "mean"]
    
    
    Luke Zappia's avatar
    Luke Zappia committed
        zinb.prop <- exp(zinb.prop) / (1 + exp(zinb.prop))
    
        params <- setParams(params, nGenes = length(logmeans),
                            cell.plates = plates, plate.var = sigma2,
                            gene.means = exp(logmeans),
                            gene.disps = dge$tagwise.dispersion,
                            gene.ziMeans = zinb.mean, gene.ziDisps = zinb.disp,
                            gene.ziProps = zinb.prop,
                            cell.libSizes = dge$samples$lib.size)
    
        return(params)