Skip to content
Snippets Groups Projects
Commit 5eb4b4c2 authored by Jeffrey Pullin's avatar Jeffrey Pullin
Browse files

Add initial analysis of FDR curves

parent 33746c69
No related branches found
No related tags found
No related merge requests found
---
title: "FDR curves"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r libraries, message = FALSE, warning = FALSE}
library(tibble)
library(dplyr)
library(ggplot2)
library(tidyr)
library(SingleCellExperiment)
library(pals)
library(forcats)
library(viridis)
```
```{r functions}
source(here::here("code", "top-genes.R"))
source(here::here("code", "find-marker-genes.R"))
source(here::here("code", "analysis-utils.R"))
get_top_true_mgs <- function(mgs, n = "all") {
# FIXME: This sorts only on the fold change which is not ideal for
# semi-unique marker genes
mgs <- mgs[order(mgs$fc, decreasing = TRUE), ]
if (n == "all") {
n <- nrow(mgs)
}
mgs[1:n, ]
}
get_top_sel_mgs <- function(mgs, n = "all") {
if ("top" %in% colnames(mgs)) {
# scran any
# FIXME: Does this matter?
mgs <- mgs[order(mgs$top, mgs$p_value), ]
} else if (all(is.na(mgs$p_value))) {
# Seurat ROC -> no p values!
mgs <- mgs[order(mgs$score, decreasing = TRUE), ]
} else {
mgs <- mgs[order(mgs$p_value), ]
}
if (n == "all") {
n <- nrow(mgs)
}
mgs[1:n, ]
}
#' Calculate the true positive rate
#'
#' @param found The marker genes found.
#' @param true The true marker genes.
#'
#' @return The true positive rate
#'
calculate_tpr <- function(found, true) {
stopifnot(is.vector(found) && is.vector(true))
n_tp <- length(base::intersect(found, true))
n <- length(found)
n_tp / n
}
```
## Aim
To investigate the FDR and TPR performance of the different methods
## Data loading
```{r calculate-fdr}
config <- yaml::read_yaml(here::here("config.yaml"))
res_paths <- here::here(list.files(config$results_folder, full.names = TRUE))
sim_paths <- here::here(list.files(config$sim_data_folder, full.names = TRUE))
# SLOW!
umgs <- list()
sumgs <- list()
for (i in seq_along(sim_paths)) {
print(i)
sim <- readRDS(sim_paths[[i]])
umgs[[i]] <- find_unique_marker_genes(sim)
sumgs[[i]] <- find_semi_unique_marker_genes(sim)
rm(sim)
}
sim_names <- substr(basename(sim_paths), 1, nchar(basename(sim_paths)) - 4)
mgs <- tibble(sim_name = sim_names, umg = umgs, sumg = sumgs)
sel_genes <- list()
for (i in seq_along(res_paths)) {
print(i)
res <- readRDS(res_paths[[i]])
# Some (DESeq2 runs lead to convergence issues.)
if (!(length(res$result) == 0)) {
# We select 700 as there are never more than around 650 true marker genes.
sel_genes[[i]] <- reformat_found_mgs(res, top_n = 700)
}
rm(res)
}
res_names <- substr(basename(res_paths), 1, nchar(basename(res_paths)) - 4)
selected_genes <- tibble(res_name = res_names, sel_genes = sel_genes)
pars <- retrive_simulation_parameters() %>%
mutate(res_name = substr(file_name, 1, nchar(file_name) - 4)) %>%
left_join(mgs, by = "sim_name") %>%
left_join(selected_genes, by = "res_name")
pars <- pars %>%
pivot_longer(cols = c(umg, sumg),
names_to = "mg_type",
values_to = "mgs")
metrics_data <- pars %>%
mutate(clusters = if_else(sim_label == "prop", list(1), list(NULL))) %>%
dplyr::rename(sel_mgs = sel_genes, true_mgs = mgs) %>%
rowwise() %>%
mutate(group_id = list(names(true_mgs))) %>%
unchop(c(sel_mgs, true_mgs, group_id))
```
```{r}
metrics_data %>%
filter(sim_label == "standard_sim" & mg_type == "umg") %>%
expand_grid(n_sel = 1:40, n_true = c(100, 200, 700)) %>%
rowwise() %>%
mutate(
true_mgs = list(get_top_true_mgs(true_mgs, n = n_true)),
sel_mgs = list(get_top_sel_mgs(sel_mgs, n = n_sel)),
tpr = calculate_tpr(sel_mgs$gene, true_mgs$gene),
fdr = 1 - tpr
) %>%
ungroup() %>%
group_by(pars, n_sel, n_true) %>%
summarise(fdr = mean(fdr), .groups = "drop") %>%
ggplot(aes(x = n_sel, y = fdr, colour = pars)) +
geom_line() +
facet_wrap(~n_true) +
scale_colour_manual(values = unname(polychrome(20))) +
labs(
x = "Number of genes selected",
y = "Number of false discoveries",
colour = "Method"
) +
theme_bw()
```
```{r}
metrics_data %>%
filter(sim_label == "standard_sim" & mg_type == "umg") %>%
expand_grid(n_sel = 1:10, n_true = c(100, 200, 700)) %>%
rowwise() %>%
mutate(
true_mgs = list(get_top_true_mgs(true_mgs, n = n_true)),
sel_mgs = list(get_top_sel_mgs(sel_mgs, n = n_sel)),
tpr = calculate_tpr(sel_mgs$gene, true_mgs$gene),
fdr = 1 - tpr
) %>%
ungroup() %>%
group_by(pars, n_sel, n_true) %>%
summarise(fdr = mean(fdr), .groups = "drop") %>%
ggplot(aes(x = n_sel, y = fdr, colour = pars)) +
geom_line() +
facet_wrap(~n_true) +
scale_colour_manual(values = unname(polychrome(20))) +
labs(
x = "Number of genes selected",
y = "Number of false discoveries",
colour = "Method"
) +
theme_bw()
```
```{r}
metrics_data %>%
filter(sim_label == "standard_sim" & mg_type == "umg") %>%
expand_grid(n_sel = c(20, 100, 200), n_true = 1:200) %>%
rowwise() %>%
mutate(
true_mgs = list(get_top_true_mgs(true_mgs, n = n_true)),
sel_mgs = list(get_top_sel_mgs(sel_mgs, n = n_sel)),
tpr = calculate_tpr(sel_mgs$gene, true_mgs$gene),
fdr = 1 - tpr
) %>%
ungroup() %>%
group_by(pars, n_sel, n_true) %>%
summarise(fdr = mean(fdr), .groups = "drop") %>%
ggplot(aes(x = n_true, y = fdr, colour = pars)) +
geom_line() +
facet_wrap(~n_sel) +
coord_cartesian(ylim = c(0, 1)) +
scale_colour_manual(values = unname(polychrome(20))) +
labs(
x = "Number of true genes",
y = "Number of false discoveries",
colour = "Method"
) +
theme_bw()
```
```{r}
metrics_data %>%
filter(sim_label == "num_cells" & mg_type == "umg") %>%
expand_grid(n_sel = 1:20, n_true = 100) %>%
rowwise() %>%
filter(!is.null(sel_mgs)) %>%
mutate(
true_mgs = list(get_top_true_mgs(true_mgs, n = n_true)),
sel_mgs = list(get_top_sel_mgs(sel_mgs, n = n_sel)),
tpr = calculate_tpr(sel_mgs$gene, true_mgs$gene),
fdr = 1 - tpr
) %>%
ungroup() %>%
group_by(pars, n_sel, n_true, batchCells) %>%
summarise(fdr = mean(fdr), .groups = "drop") %>%
ggplot(aes(x = n_sel, y = fdr, colour = pars)) +
geom_line() +
facet_wrap(~batchCells) +
scale_colour_manual(values = unname(polychrome(20))) +
labs(
x = "Number of genes selected",
y = "Number of false discoveries",
colour = "Method"
) +
theme_bw()
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment