Economise simulated data analyses

d9e0eb8e · Jeffrey Pullin · 1c91d21d · d9e0eb8e · 1c91d21d
Commit d9e0eb8e authored 3 years ago by Jeffrey Pullin
--- a/analysis/sim-data-analysis.Rmd
+++ b/analysis/sim-data-analysis.Rmd
--- a/analysis/simulated-marker-genes.Rmd
+++ b/analysis/simulated-marker-genes.Rmd
---
-title: "Simulated marker genes"
-output: html_document
---
-
-```{r setup, include = FALSE}
-knitr::opts_chunk$set(echo = TRUE)
-```
-
-```{r libraries, messages = FALSE}
-library(dplyr)
-library(tibble)
-library(ggplot2)
-library(scater)
-library(purrr)
-library(splatter)
-library(scran)
-
-source(here::here("code", "analysis-utils.R"))
-```
-
-```{r}
-
-#' Find candidate marker genes from a dataset simulated using splatter
-#'
-#' @param sce A SingleCellExperiment object simulated using splatter
-#'
-#' @details TODO
-#'
-find_marker_genes <- function(sce) {
-
-  stopifnot(is(sce, "SingleCellExperiment"))
-
-  n_groups <- length(unique(colData(sce)$Group))
-  col_names <- paste0("DEFacGroup", seq_len(n_groups))
-  data <- as.matrix(rowData(sce)[, col_names])
-
-  out <- list()
-  for (i in seq_len(n_groups)) {
-    n_genes <- nrow(data)
-    not_i <- setdiff(1:n_groups, i)
-
-    mean_scores <- numeric(n_genes)
-    max_scores <- numeric(n_genes)
-    median_scores <- numeric(n_genes)
-    directions <- character(n_genes)
-    de_values <- numeric(n_genes)
-    for (j in seq_len(n_genes)) {
-      y <- data[j, not_i]
-      x <- data[j, i]
-      log_ratios <- log(x / y)
-      mean_scores[[j]] <- mean(log_ratios)
-      median_scores[[j]] <- median(log_ratios)
-      max_scores[[j]] <- max(log_ratios)
-      de_values[[j]] <- x
-      if (x > 1) {
-        directions[[j]] <- "up"
-      } else if (x == 1) {
-        directions[[j]] <- "background"
-      } else {
-        directions[[j]] <- "down"
-      }
-    }
-    out[[i]] <- tibble::tibble(
-      gene = rowData(sce)$Gene,
-      gene_mean = rowData(sce)$BaseGeneMean,
-      # FIXME: Kept for backwards compatibility.
-      fc = mean_scores,
-      mean_score = mean_scores,
-      median_score = median_scores,
-      max_score = max_scores,
-      direction = directions,
-      de_value = de_values
-    )
-  }
-
-  names(out) <- paste0("group_", seq_len(n_groups))
-  out
-}
-```
-
-```{r load-marker-genes}
-pars <- retrive_simulation_parameters() %>% 
-  filter(sim_label == "standard") %>% 
-  rowwise() %>% 
-  mutate(umg_path = here::here(
-    "data", "sim_mgs", paste0("mg-", sim_name, "-", data_id, ".rds")) 
-  ) %>% 
-  rowwise() %>% 
-  mutate(mgs = list(readRDS(umg_path))) %>% 
-  ungroup() %>% 
-  filter(pars == "seurat_poisson") %>% 
-  filter(rep == 1)
-
-zeisel_mgs <- pars %>% 
-  filter(data_id == "zeisel") %>% 
-  pull(mgs) %>% 
-  pluck(1) %>% 
-  lapply(function(x) get_top_true_mgs(x, direction = "up"))  %>% 
-  # Work with the first cluster.
-  pluck(1) %>% 
-  as_tibble()
-
-pbmc3k_mgs <- pars %>% 
-  filter(data_id == "pbmc3k") %>% 
-  pull(mgs) %>% 
-  pluck(1) %>% 
-  lapply(function(x) get_top_true_mgs(x, direction = "up", 
-                                      sort_by_score = "mean_score"))  %>% 
-  # Work with the first cluster.
-  pluck(1) %>% 
-  as_tibble()
-
-lawlor_mgs <- pars %>% 
-  filter(data_id == "lawlor") %>% 
-  pull(mgs) %>% 
-  pluck(1) %>% 
-  lapply(function(x) get_top_true_mgs(x, direction = "up"))  %>% 
-  # Work with the first cluster.
-  pluck(1) %>% 
-  as_tibble()
-```
-
-```{r load-simulated-data}
-sim_data_folder <- here::here("data", "sim_data")
-pbmc3k_1 <- readRDS(file.path(sim_data_folder, "standard_sim_1-pbmc3k.rds"))
-zeisel_1 <- readRDS(file.path(sim_data_folder, "standard_sim_1-zeisel.rds"))
-lawlor_1 <- readRDS(file.path(sim_data_folder, "standard_sim_1-lawlor.rds"))
-```
-
-```{r pbmc3k-top-10}
-plotExpression(pbmc3k_1, features = pbmc3k_mgs$gene[1:20], x = "label")
-
-rowData(pbmc3k_1) %>% 
-  as_tibble() %>% 
-  filter(Gene == "Gene931")
-
-rowData(pbmc3k_1) %>% 
-  as_tibble() %>% 
-  filter(Gene == "Gene1080")
-
-rowData(pbmc3k_1) %>% 
-  as_tibble() %>% 
-  filter(OutlierFactor > 1 & DEFacGroup1 < 1)
-
-plotExpression(pbmc3k_1, features = c("Gene370", "Gene1980"), x = "label")
-
-readRDS(here::here("results", "sim_data", "standard_sim_1-pbmc3k-seurat_wilcox.rds"))
-seurat_t <- readRDS(
-  here::here("results", "sim_data", "standard_sim_1-pbmc3k-seurat_t.rds")
-)
-
-seurat_t %>% 
-  pluck("result") %>% 
-  filter(log_fc < 0)
-
-
-plotExpression(pbmc3k_1, x = "label", features = "Gene1980")
-
-
-find_marker_genes(pbmc3k_1) %>% 
-  pluck(1) %>% 
-  filter(gene == "Gene931")
-
-rowSums(counts(pbmc3k_1[281, ]))
-```
-
-```{r pbmc3k-top-genes}
-plotExpression(pbmc3k_1, features = pbmc3k_mgs$gene[1:10], x = "label")
-plotExpression(pbmc3k_1, features = pbmc3k_mgs$gene[50:60], x = "label")
-```
-
-```{r}
-means_1 <- rowMeans(counts(pbmc3k_1[pbmc3k_mgs$gene, pbmc3k_1$label == "Group1"]))
-means_2 <- rowMeans(counts(pbmc3k_1[pbmc3k_mgs$gene, pbmc3k_1$label != "Group1"]))
-log_fcs <- log(means_1 / means_2)
-
-tibble(rank = 1:length(log_fcs), log_fc = log_fcs) %>% 
-  ggplot(aes(x = rank, y = log_fc)) + 
-  geom_point(colour = "seagreen") + 
-  labs(
-    x = "True marker gene rank", 
-    y = "Two sample log fold change"
-  ) + 
-  theme_bw()
-```
-
-```{r ziesel}
-plotExpression(zeisel_1, features = zeisel_mgs$gene[1:10], x = "label")
-plotExpression(zeisel_1, features = zeisel_mgs$gene[50:60], x = "label")
-# plotExpression(zeisel_1, features = zeisel_mgs$gene[100:110], x = "label")
-```
-
-```{r lawlor}
-plotExpression(lawlor_1, features = lawlor_mgs$gene[1:5], x = "label")
-plotExpression(lawlor_1, features = lawlor_mgs$gene[50:55], x = "label")
-# plotExpression(lawlor_1, features = lawlor_mgs$gene[100:105], x = "label")
-```
-
-```{r}
-pbmc3k <- readRDS(here::here("data", "real_data", "pbmc3k.rds"))
-
-calculate_mean_diff <- function(sce, cluster_label) {
-  counts <- counts(sce)
-  x <- as.numeric(colLabels(sce) == cluster_label)
-  n_genes <- nrow(sce)
-  
-  out <- numeric(n_genes)
-  for (i in seq_len(n_genes)) {
-    y <- counts[i, ]
-    y_1 <- y[x == 0]
-    y_2 <- y[x == 1]
-    out[[i]] <- mean(y_1) - mean(y_2)
-  }
-  out
-}
-
-pbmc3k_mean_diff <- calculate_mean_diff(pbmc3k, "Group1")
-
-data.frame(mean_diff = pbmc3k_mean_diff) %>% 
-  ggplot(aes(mean_diff)) + 
-  geom_histogram(bins = 30)
-```
-
-```{r}
-# pbmc3k_sim <- readRDS(
-#   here::here("data", "sim_data", "standard_sim_1-pbmc3k.rds")
-# )
-# 
-# 
-# mean_diff <- numeric(nrow(pbmc3k_sim))
-# for (i in 1:nrow(pbmc3k_sim)) {
-#   print(i)
-#   mean_1 <- median(counts(pbmc3k_sim[i, pbmc3k_sim$label == "Group1"]))
-#   mean_2 <- median(counts(pbmc3k_sim[i, pbmc3k_sim$label != "Group1"]))
-#   mean_diff[[i]] <- mean_1 - mean_2
-# }
-# 
-# umgs$mean_diff <- mean_diff
-# 
-# umgs %>% 
-#   as_tibble() %>% 
-#   #arrange(mean_diff) %>% 
-#   arrange(desc(abs(mean_score)))
-# 
-# plotExpression(pbmc3k_sim, x = "label", "Gene281")
-```
-
-```{r}
-# pbmc3k <- readRDS(here::here("data", "real_data", "pbmc3k.rds"))
-# pbmc3k_subset <- pbmc3k[, pbmc3k$label == "B"]
-# 
-# sim_data <- pbmc3k_subset
-# 
-# sim_counts <- as.matrix(counts(sim_data))
-# params <- splatEstimate(sim_counts)
-# 
-# params <- setParam(params, "group.prob", rep(1 / 5, 5))
-# params <- setParam(params, "batchCells", 2000)
-# params <- setParam(params, "seed", 101)
-# params <- setParam(params, "de.facLoc", 4)
-# params <- setParam(params, "de.facScale", 0.2)
-# 
-# sim <- splatSimulateGroups(params)
-# clusters <- quickCluster(sim)
-# sim <- computeSumFactors(sim, clusters = clusters)
-# sim <- logNormCounts(sim)
-# colLabels(sim) <- colData(sim)$Group
-# 
-# umgs <- find_marker_genes(sim)
-# 
-# umgs[[1]] %>% 
-#   as_tibble() %>% 
-#   arrange(desc(abs(mean_score))) %>% 
-#   print(n = 40)
-# 
-# plotExpression(sim, x = "label", "Gene770")
-# 
-# log_fc <- numeric(nrow(sim))
-# for (i in 1:nrow(sim)) {
-#   print(i)
-#   mean_1 <- mean(counts(sim[i, sim$label == "Group1"]))
-#   mean_2 <- mean(counts(sim[i, sim$label != "Group1"]))
-#   log_fc[[i]] <- log(mean_1 / mean_2)
-# }
-# 
-# hist(log_fc)
-# 
-# median_diff <- numeric(nrow(sim))
-# for (i in 1:nrow(sim)) {
-#   print(i)
-#   median_1 <- median(logcounts(sim[i, sim$label == "Group1"]))
-#   median_2 <- median(logcounts(sim[i, sim$label != "Group1"]))
-#   median_diff[[i]] <- median_1 - median_2
-# }
-# 
-# hist(median_diff)
-# 
-# 
-# sum(abs(median_diff) > 2)
-# 
-# umgs$median_diff <- median_diff
-# 
-# log_fc_real <- numeric(nrow(pbmc3k)) 
-# for (i in 1:nrow(pbmc3k)) {
-#   print(i)
-#   mean_1 <- mean(counts(pbmc3k[i, pbmc3k$label == "B"]))
-#   mean_2 <- mean(counts(pbmc3k[i, pbmc3k$label != "B"]))
-#   log_fc_real[[i]] <- log(mean_1 / mean_2)
-# }
-# 
-# hist(log_fc_real)
-# 
-# median_diff_real <- numeric(nrow(pbmc3k)) 
-# for (i in 1:nrow(pbmc3k)) {
-#   print(i)
-#   median_1 <- median(logcounts(pbmc3k[i, pbmc3k$label == "B"]))
-#   median_2 <- median(logcounts(pbmc3k[i, pbmc3k$label != "B"]))
-#   median_diff_real[[i]] <- median_1 - median_2
-# }
-# 
-# hist(median_diff_real)
-# sum(abs(median_diff_real) > 2)
-# 
-# sum(abs(log_fc) > 3, na.rm = TRUE)
-# sum(abs(log_fc_real) > 3, na.rm = TRUE)
-# 
-# gene_log_fc <- tibble(gene = rownames(pbmc3k), log_fc = log_fc_real)
-# gene_log_fc %>% 
-#   arrange(desc(abs(log_fc))) %>% 
-#   print(n = 40)
-# 
-# gene_median_diff <- tibble(
-#   gene = rownames(pbmc3k), 
-#   median_diff = median_diff_real
-# )
-# gene_median_diff %>% 
-#   arrange(desc(abs(median_diff))) %>% 
-#   print(n = 20)
-# 
-# gene_median_diff %>% 
-#   filter(gene == "MS4A1")
-# 
-# gene_log_fc %>% 
-#   filter(gene == "MS4A1")
-# 
-# 
-# plotExpression(pbmc3k, x = "label", "CD79A")
-```