Add preliminary analysis

4ffcef4a · Jeffrey Pullin · b6d217b2 · 4ffcef4a
Commit 4ffcef4a authored 4 years ago by Jeffrey Pullin
--- a/analysis/method-concordance.Rmd
+++ b/analysis/method-concordance.Rmd
+---
+title: "Method concordance"
+author: "Jeffrey Pullin"
+date: "25/10/2020"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+### Aim 
+
+The aim of this analysis is to establish how concordant the various methods are (how well they agree) on typical datasets.
+
+### Analysis
+
+```{r libraries}
+library(scran)
+library(scater)
+library(Seurat)
+library(DropletUtils)
+library(reticulate)
+```
+
+First, we load the `pbmc4k` dataset. This dataset is used currently because it is small and easy to obtain. 
+ 
+```{r load-pbmc3k, warning = FALSE, message = FALSE}
+pbmcs <- TENxPBMCData::TENxPBMCData("pbmc4k")
+pbmcs
+```
+
+To perform marker gene analysis we need to have clustered data. To cluster the data we use the following workflow, adapted from the Orchestrating single cell analysis with Bioconductor book:
+
+```{r prepare-data}
+sce.pbmc <- pbmcs
+
+#--- normalization ---#
+set.seed(1000)
+clusters <- quickCluster(sce.pbmc)
+sce.pbmc <- computeSumFactors(sce.pbmc, clusters = clusters)
+sce.pbmc <- logNormCounts(sce.pbmc)
+
+#--- variance-modelling ---#
+set.seed(1001)
+dec.pbmc <- modelGeneVarByPoisson(sce.pbmc)
+top.pbmc <- getTopHVGs(dec.pbmc, prop = 0.1)
+
+#--- dimensionality-reduction ---#
+set.seed(10000)
+sce.pbmc <- denoisePCA(sce.pbmc, subset.row = top.pbmc, technical = dec.pbmc)
+
+set.seed(100000)
+sce.pbmc <- runTSNE(sce.pbmc, dimred = "PCA")
+
+set.seed(1000000)
+sce.pbmc <- runUMAP(sce.pbmc, dimred = "PCA")
+
+#--- clustering ---#
+g <- buildSNNGraph(sce.pbmc, k = 10, use.dimred = 'PCA')
+clust <- igraph::cluster_walktrap(g)$membership
+colLabels(sce.pbmc) <- factor(clust)
+
+pbmcs <- sce.pbmc
+```
+
+We can now run the marker gene methods. We will focus our attention on cluster 9. Note that this cluster is focused on in the Marker Gene chapter of OCSA.  
+
+First we run {scran}:
+
+```{r mg-scran}
+# A bit slow on my MacBook Air. 
+scran_mgs <- findMarkers(pbmcs)
+
+# Focus on the marker genes for cluster 9.
+scran_mgs_9 <- scran_mgs[["9"]]
+scran_mgs_9
+```
+
+Next we can run {Seurat}:
+
+```{r mg-seurat}
+# Seurat requires column names and cannot handle DelayedArray objects. 
+pbmcs2 <- pbmcs
+colnames(pbmcs2) <- paste0("cell_", 1:4340)
+counts(pbmcs2) <- as(counts(pbmcs2), "dgCMatrix")
+logcounts(pbmcs2) <- as(logcounts(pbmcs2), "Matrix")
+
+seurat_data <- as.Seurat(pbmcs2)
+Idents(seurat_data) <- colLabels(pbmcs2)
+seurat_mgs_9 <- FindMarkers(seurat_data, ident.1 = 9)
+```
+
+We can now compare the results from {Seurat} and {scran}. We use the genes with `Top <= 6` for {scran}, that is the top 6 genes in each pairwise comparison, as suggested in OSCA. This gives us 52 genes so we filter the {Seurat} genes to the top 52 based on p-value and check the concordance. 
+
+```{r compare-mgs-9}
+# 52 marker genes
+scran_top <- scran_mgs_9[scran_mgs_9$Top <= 6,][, 1:4]
+seurat_top <- seurat_mgs_9[1:52, ]
+
+scran_top_genes <- row.names(scran_top)
+seurat_top_genes <- row.names(seurat_top)
+
+length(intersect(scran_top_genes, seurat_top_genes))
+
+# 24 marker genes
+scran_top <- scran_mgs_9[scran_mgs_9$Top <= 2,][, 1:4]
+seurat_top <- seurat_mgs_9[1:24, ]
+
+scran_top_genes <- row.names(scran_top)
+seurat_top_genes <- row.names(seurat_top)
+
+length(intersect(scran_top_genes, seurat_top_genes))
+
+# 14 marker genes
+scran_top <- scran_mgs_9[scran_mgs_9$Top <= 1,][, 1:4]
+seurat_top <- seurat_mgs_9[1:14, ]
+
+scran_top_genes <- row.names(scran_top)
+seurat_top_genes <- row.names(seurat_top)
+
+length(intersect(scran_top_genes, seurat_top_genes))
+
+## Sort both by p-value 
+
+scran_mgs_9 <- scran_mgs_9[order(scran_mgs_9$p.value), ]
+# Seurat already sorts by p-value.
+
+n <- 50
+scran_top_genes <- row.names(head(scran_mgs_9, n))
+seurat_top_genes <- row.names(head(seurat_mgs_9, n))
+# V. bad. 
+length(intersect(scran_top_genes, seurat_top_genes))
+```
+
+What about {scanpy}?? 
+
+```{r scanpy}
+sc <- import("scanpy")
+pd <- import("pandas")
+
+adata_sce <- sc$AnnData(
+    X   = t(logcounts(pbmcs2)),
+    obs = as.data.frame(colData(pbmcs2)),
+    var = as.data.frame(rowData(pbmcs2))
+)
+
+sc$tl$rank_genes_groups(adata_sce, "label", method = "wilcoxon")
+sc$pl$rank_genes_groups(adata_sce)
+
+py_run_string("import pandas as pd")
+py_run_string(
+  "scanpy_mgs = pd.DataFrame(r.adata_sce.uns['rank_genes_groups']['names'])"
+)
+# Doesn't have the actual values... but I think it sorted.
+head(py$scanpy_mgs)
+scanpy_mgs_9_genes <- py$scanpy_mgs[["9"]]
+seurat_mgs_9_genes <- row.names(seurat_mgs_9)
+
+n <- 50
+length(intersect(scanpy_mgs_9_genes[1:n], seurat_mgs_9_genes[1:n]))
+
+```