Skip to content
Snippets Groups Projects
Commit d9e0eb8e authored by Jeffrey Pullin's avatar Jeffrey Pullin
Browse files

Economise simulated data analyses

parent 1c91d21d
No related branches found
No related tags found
No related merge requests found
Pipeline #10011 passed
---
title: "Simulated marker genes"
output: html_document
---
```{r setup, include = FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r libraries, messages = FALSE}
library(dplyr)
library(tibble)
library(ggplot2)
library(scater)
library(purrr)
library(splatter)
library(scran)
source(here::here("code", "analysis-utils.R"))
```
```{r}
#' Find candidate marker genes from a dataset simulated using splatter
#'
#' @param sce A SingleCellExperiment object simulated using splatter
#'
#' @details TODO
#'
find_marker_genes <- function(sce) {
stopifnot(is(sce, "SingleCellExperiment"))
n_groups <- length(unique(colData(sce)$Group))
col_names <- paste0("DEFacGroup", seq_len(n_groups))
data <- as.matrix(rowData(sce)[, col_names])
out <- list()
for (i in seq_len(n_groups)) {
n_genes <- nrow(data)
not_i <- setdiff(1:n_groups, i)
mean_scores <- numeric(n_genes)
max_scores <- numeric(n_genes)
median_scores <- numeric(n_genes)
directions <- character(n_genes)
de_values <- numeric(n_genes)
for (j in seq_len(n_genes)) {
y <- data[j, not_i]
x <- data[j, i]
log_ratios <- log(x / y)
mean_scores[[j]] <- mean(log_ratios)
median_scores[[j]] <- median(log_ratios)
max_scores[[j]] <- max(log_ratios)
de_values[[j]] <- x
if (x > 1) {
directions[[j]] <- "up"
} else if (x == 1) {
directions[[j]] <- "background"
} else {
directions[[j]] <- "down"
}
}
out[[i]] <- tibble::tibble(
gene = rowData(sce)$Gene,
gene_mean = rowData(sce)$BaseGeneMean,
# FIXME: Kept for backwards compatibility.
fc = mean_scores,
mean_score = mean_scores,
median_score = median_scores,
max_score = max_scores,
direction = directions,
de_value = de_values
)
}
names(out) <- paste0("group_", seq_len(n_groups))
out
}
```
```{r load-marker-genes}
pars <- retrive_simulation_parameters() %>%
filter(sim_label == "standard") %>%
rowwise() %>%
mutate(umg_path = here::here(
"data", "sim_mgs", paste0("mg-", sim_name, "-", data_id, ".rds"))
) %>%
rowwise() %>%
mutate(mgs = list(readRDS(umg_path))) %>%
ungroup() %>%
filter(pars == "seurat_poisson") %>%
filter(rep == 1)
zeisel_mgs <- pars %>%
filter(data_id == "zeisel") %>%
pull(mgs) %>%
pluck(1) %>%
lapply(function(x) get_top_true_mgs(x, direction = "up")) %>%
# Work with the first cluster.
pluck(1) %>%
as_tibble()
pbmc3k_mgs <- pars %>%
filter(data_id == "pbmc3k") %>%
pull(mgs) %>%
pluck(1) %>%
lapply(function(x) get_top_true_mgs(x, direction = "up",
sort_by_score = "mean_score")) %>%
# Work with the first cluster.
pluck(1) %>%
as_tibble()
lawlor_mgs <- pars %>%
filter(data_id == "lawlor") %>%
pull(mgs) %>%
pluck(1) %>%
lapply(function(x) get_top_true_mgs(x, direction = "up")) %>%
# Work with the first cluster.
pluck(1) %>%
as_tibble()
```
```{r load-simulated-data}
sim_data_folder <- here::here("data", "sim_data")
pbmc3k_1 <- readRDS(file.path(sim_data_folder, "standard_sim_1-pbmc3k.rds"))
zeisel_1 <- readRDS(file.path(sim_data_folder, "standard_sim_1-zeisel.rds"))
lawlor_1 <- readRDS(file.path(sim_data_folder, "standard_sim_1-lawlor.rds"))
```
```{r pbmc3k-top-10}
plotExpression(pbmc3k_1, features = pbmc3k_mgs$gene[1:20], x = "label")
rowData(pbmc3k_1) %>%
as_tibble() %>%
filter(Gene == "Gene931")
rowData(pbmc3k_1) %>%
as_tibble() %>%
filter(Gene == "Gene1080")
rowData(pbmc3k_1) %>%
as_tibble() %>%
filter(OutlierFactor > 1 & DEFacGroup1 < 1)
plotExpression(pbmc3k_1, features = c("Gene370", "Gene1980"), x = "label")
readRDS(here::here("results", "sim_data", "standard_sim_1-pbmc3k-seurat_wilcox.rds"))
seurat_t <- readRDS(
here::here("results", "sim_data", "standard_sim_1-pbmc3k-seurat_t.rds")
)
seurat_t %>%
pluck("result") %>%
filter(log_fc < 0)
plotExpression(pbmc3k_1, x = "label", features = "Gene1980")
find_marker_genes(pbmc3k_1) %>%
pluck(1) %>%
filter(gene == "Gene931")
rowSums(counts(pbmc3k_1[281, ]))
```
```{r pbmc3k-top-genes}
plotExpression(pbmc3k_1, features = pbmc3k_mgs$gene[1:10], x = "label")
plotExpression(pbmc3k_1, features = pbmc3k_mgs$gene[50:60], x = "label")
```
```{r}
means_1 <- rowMeans(counts(pbmc3k_1[pbmc3k_mgs$gene, pbmc3k_1$label == "Group1"]))
means_2 <- rowMeans(counts(pbmc3k_1[pbmc3k_mgs$gene, pbmc3k_1$label != "Group1"]))
log_fcs <- log(means_1 / means_2)
tibble(rank = 1:length(log_fcs), log_fc = log_fcs) %>%
ggplot(aes(x = rank, y = log_fc)) +
geom_point(colour = "seagreen") +
labs(
x = "True marker gene rank",
y = "Two sample log fold change"
) +
theme_bw()
```
```{r ziesel}
plotExpression(zeisel_1, features = zeisel_mgs$gene[1:10], x = "label")
plotExpression(zeisel_1, features = zeisel_mgs$gene[50:60], x = "label")
# plotExpression(zeisel_1, features = zeisel_mgs$gene[100:110], x = "label")
```
```{r lawlor}
plotExpression(lawlor_1, features = lawlor_mgs$gene[1:5], x = "label")
plotExpression(lawlor_1, features = lawlor_mgs$gene[50:55], x = "label")
# plotExpression(lawlor_1, features = lawlor_mgs$gene[100:105], x = "label")
```
```{r}
pbmc3k <- readRDS(here::here("data", "real_data", "pbmc3k.rds"))
calculate_mean_diff <- function(sce, cluster_label) {
counts <- counts(sce)
x <- as.numeric(colLabels(sce) == cluster_label)
n_genes <- nrow(sce)
out <- numeric(n_genes)
for (i in seq_len(n_genes)) {
y <- counts[i, ]
y_1 <- y[x == 0]
y_2 <- y[x == 1]
out[[i]] <- mean(y_1) - mean(y_2)
}
out
}
pbmc3k_mean_diff <- calculate_mean_diff(pbmc3k, "Group1")
data.frame(mean_diff = pbmc3k_mean_diff) %>%
ggplot(aes(mean_diff)) +
geom_histogram(bins = 30)
```
```{r}
# pbmc3k_sim <- readRDS(
# here::here("data", "sim_data", "standard_sim_1-pbmc3k.rds")
# )
#
#
# mean_diff <- numeric(nrow(pbmc3k_sim))
# for (i in 1:nrow(pbmc3k_sim)) {
# print(i)
# mean_1 <- median(counts(pbmc3k_sim[i, pbmc3k_sim$label == "Group1"]))
# mean_2 <- median(counts(pbmc3k_sim[i, pbmc3k_sim$label != "Group1"]))
# mean_diff[[i]] <- mean_1 - mean_2
# }
#
# umgs$mean_diff <- mean_diff
#
# umgs %>%
# as_tibble() %>%
# #arrange(mean_diff) %>%
# arrange(desc(abs(mean_score)))
#
# plotExpression(pbmc3k_sim, x = "label", "Gene281")
```
```{r}
# pbmc3k <- readRDS(here::here("data", "real_data", "pbmc3k.rds"))
# pbmc3k_subset <- pbmc3k[, pbmc3k$label == "B"]
#
# sim_data <- pbmc3k_subset
#
# sim_counts <- as.matrix(counts(sim_data))
# params <- splatEstimate(sim_counts)
#
# params <- setParam(params, "group.prob", rep(1 / 5, 5))
# params <- setParam(params, "batchCells", 2000)
# params <- setParam(params, "seed", 101)
# params <- setParam(params, "de.facLoc", 4)
# params <- setParam(params, "de.facScale", 0.2)
#
# sim <- splatSimulateGroups(params)
# clusters <- quickCluster(sim)
# sim <- computeSumFactors(sim, clusters = clusters)
# sim <- logNormCounts(sim)
# colLabels(sim) <- colData(sim)$Group
#
# umgs <- find_marker_genes(sim)
#
# umgs[[1]] %>%
# as_tibble() %>%
# arrange(desc(abs(mean_score))) %>%
# print(n = 40)
#
# plotExpression(sim, x = "label", "Gene770")
#
# log_fc <- numeric(nrow(sim))
# for (i in 1:nrow(sim)) {
# print(i)
# mean_1 <- mean(counts(sim[i, sim$label == "Group1"]))
# mean_2 <- mean(counts(sim[i, sim$label != "Group1"]))
# log_fc[[i]] <- log(mean_1 / mean_2)
# }
#
# hist(log_fc)
#
# median_diff <- numeric(nrow(sim))
# for (i in 1:nrow(sim)) {
# print(i)
# median_1 <- median(logcounts(sim[i, sim$label == "Group1"]))
# median_2 <- median(logcounts(sim[i, sim$label != "Group1"]))
# median_diff[[i]] <- median_1 - median_2
# }
#
# hist(median_diff)
#
#
# sum(abs(median_diff) > 2)
#
# umgs$median_diff <- median_diff
#
# log_fc_real <- numeric(nrow(pbmc3k))
# for (i in 1:nrow(pbmc3k)) {
# print(i)
# mean_1 <- mean(counts(pbmc3k[i, pbmc3k$label == "B"]))
# mean_2 <- mean(counts(pbmc3k[i, pbmc3k$label != "B"]))
# log_fc_real[[i]] <- log(mean_1 / mean_2)
# }
#
# hist(log_fc_real)
#
# median_diff_real <- numeric(nrow(pbmc3k))
# for (i in 1:nrow(pbmc3k)) {
# print(i)
# median_1 <- median(logcounts(pbmc3k[i, pbmc3k$label == "B"]))
# median_2 <- median(logcounts(pbmc3k[i, pbmc3k$label != "B"]))
# median_diff_real[[i]] <- median_1 - median_2
# }
#
# hist(median_diff_real)
# sum(abs(median_diff_real) > 2)
#
# sum(abs(log_fc) > 3, na.rm = TRUE)
# sum(abs(log_fc_real) > 3, na.rm = TRUE)
#
# gene_log_fc <- tibble(gene = rownames(pbmc3k), log_fc = log_fc_real)
# gene_log_fc %>%
# arrange(desc(abs(log_fc))) %>%
# print(n = 40)
#
# gene_median_diff <- tibble(
# gene = rownames(pbmc3k),
# median_diff = median_diff_real
# )
# gene_median_diff %>%
# arrange(desc(abs(median_diff))) %>%
# print(n = 20)
#
# gene_median_diff %>%
# filter(gene == "MS4A1")
#
# gene_log_fc %>%
# filter(gene == "MS4A1")
#
#
# plotExpression(pbmc3k, x = "label", "CD79A")
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment