Speed up lm method

6935d277 · Jeffrey Pullin · add2bc55 · 6935d277
Commit 6935d277 authored 3 years ago by Jeffrey Pullin
--- a/code/run-lm.R
+++ b/code/run-lm.R
-#' Find marker genes using a linear model implemented via `lm`.
-#'
-#' Wraps the `stats::lm` function.
+#' Find marker genes using a linear model's i.e. Student's t-test.
 #'
 #' @param sce A SingleCellExperiment object
 #' @param pars parameters passed to the function.
@@ -17,70 +15,47 @@ run_lm <- function(sce, pars) {
    n_groups <- length(unique(colLabels(sce)))
    n_genes <- nrow(sce)

-    x_ungrouped <- factor(as.numeric(colLabels(sce)))
-
    out <- vector("list", length = n_groups)
    for (i in seq_len(n_groups)) {
-      print("Group!")
-

      group <- levels(colLabels(sce))[[i]]
      x <- factor(as.numeric(colLabels(sce) == group))

-      group_out <- vector("list", length = n_genes)
-      for (j in seq_len(n_genes)) {
-
-        print(paste0("Gene", j))
-        gene <- rownames(sce)[[j]]
-        y <- as.vector(logcounts(sce[j, ]))
-
-        if (pars$covariate == "two_sample") {
-
-          grouped_model <- lm(y ~ x)
-          coefs <- summary(grouped_model)$coefficients
-          # Extract values from the fitted model.
-          p_value <- coefs[2, 4]
-          # Bonferonni correction.
-          p_value_adj <- p_value / n_genes
-          t_value <- coefs[2, 3]
-          score <- coefs[2, 1]
-
-        } else if (pars$covariate == "ungrouped") {
-          # FIXME: Hack to prevent computation problems.
-          y_ungrouped <- y + 1
-          ungrouped_model <- lm(y_ungrouped ~ 0 + x_ungrouped)
-          k <- n_groups - 1
-          K_vec <- rep(-1/k, n_groups)
-          K_vec[[j]] <- 1
-          K <- matrix(K_vec, nrow = 1)
-          contrast_fit <- multcomp::glht(ungrouped_model, linfct = K)
-          summary_fit <- summary(contrast_fit)
-          p_value <- summary_fit$test$pvalues[[1]]
-          p_value_adj <- p_value / n_genes
-          t_value <- summary_fit$test$tstat[[1]]
-          # FIXME What should this be?
-          score <- 0
-        } else {
-          stop("Invaild pars value.", call = FALSE)
-        }
+      t_stat <- function(X) {
+        m <- rowMeans(X)
+        n <- ncol(X)
+        var <- rowSums((X - m) ^ 2)

-        group_out[[j]] <- list(
-          p_value = p_value,
-          raw_statistic = score,
-          log_fc = 0,
-          p_value_adj = p_value_adj,
-          gene = gene,
-          cluster = group,
-          scaled_statistic = t_value
-        )
-        group_df <- dplyr::bind_rows(!!!group_out)
-        # This is implicitly Student's t-test so the degrees of freedom
-        # will be same between all genes so sorting on the scaled statistic
-        # i.e. the t-value is okay.
-        group_df <- dplyr::arrange(group_df, desc(abs(scaled_statistic)))
+        list(m = m, n = n, var = var)
      }

-      out[[i]] <- group_df
+      g_1 <- t_stat(logcounts(sce)[, x == 1])
+      g_2 <- t_stat(logcounts(sce)[, x == 0])
+
+      num <- g_1$m - g_2$m
+      s2 <- (g_1$var + g_2$var) / (g_1$n + g_2$n - 2)
+      denom <-  sqrt(s2 * (1 / g_1$n + 1 / g_2$n))
+      scaled_statistic <- num / denom
+
+      p_value <- pt(abs(scaled_statistic), df = g_1$n + g_2$n - 2,
+                    lower.tail = FALSE)
+      p_value_adj <- p_value / n_genes
+
+      group_out <- tibble::tibble(
+        raw_statistic = num,
+        scaled_statistic = scaled_statistic,
+        p_value = p_value,
+        p_value_adj = p_value_adj,
+        log_fc = rep(0, n_genes),
+        gene = rownames(sce),
+        cluster = group
+      )
+
+      # This is implicitly Student's t-test so the degrees of freedom
+      # will be same between all genes so sorting on the scaled statistic
+      # i.e. the t-value is okay.
+      group_out <- dplyr::arrange(group_out, desc(abs(scaled_statistic)))
+      out[[i]] <- group_out
    }

    # Group the results for each group together, preserving the within group