From 95b8e073121c7b368343991e6f25816c90892c77 Mon Sep 17 00:00:00 2001
From: Davis McCarthy <davismcc@gmail.com>
Date: Wed, 2 Oct 2019 22:56:30 +1000
Subject: [PATCH] Updating DE, imputation and projection chapters

---
 course_files/de-real.Rmd    | 174 +++++++++--
 course_files/imputation.Rmd |  52 ++--
 course_files/projection.Rmd | 592 ++++++++++++++++++++++++------------
 course_files/search.Rmd     | 243 ---------------
 4 files changed, 578 insertions(+), 483 deletions(-)
 delete mode 100644 course_files/search.Rmd

diff --git a/course_files/de-real.Rmd b/course_files/de-real.Rmd
index 2e56699..ff961c8 100644
--- a/course_files/de-real.Rmd
+++ b/course_files/de-real.Rmd
@@ -4,7 +4,7 @@ output: html_document
 ---
 
 ```{r setup, echo=FALSE}
-knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=FALSE)
+knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE, message=FALSE, warning=FALSE)
 knitr::opts_knit$set(root.dir = normalizePath(".."))
 ```
 
@@ -13,7 +13,7 @@ knitr::opts_knit$set(root.dir = normalizePath(".."))
 ```{r, echo=TRUE, message=FALSE, warning=FALSE}
 library(scRNA.seq.funcs)
 library(edgeR)
-library(monocle)
+library(limma)
 library(MAST)
 library(ROCR)
 set.seed(1)
@@ -21,10 +21,13 @@ set.seed(1)
 
 ### Introduction
 
-To test different single-cell differential expression methods we will be using the Blischak dataset from Chapters 7-17.
-For this experiment bulk RNA-seq data for each cell-line was generated in addition to single-cell data. We will use the
-differentially expressed genes identified using standard methods on the respective bulk data as the ground truth for evaluating the
-accuracy of each single-cell method. To save time we have pre-computed these for you. You can run the commands below to load these data.
+To test different single-cell differential expression methods we will be using
+the Blischak dataset from Chapters 7-17. For this experiment bulk RNA-seq data
+for each cell-line was generated in addition to single-cell data. We will use
+the differentially expressed genes identified using standard methods on the
+respective bulk data as the ground truth for evaluating the accuracy of each
+single-cell method. To save time we have pre-computed these for you. You can run
+the commands below to load these data.
 
 ```{r}
 DE <- read.table("data/tung/TPs.txt")
@@ -35,7 +38,8 @@ GroundTruth <- list(
 )
 ```
 
-This ground truth has been produce for the comparison of individual NA19101 to NA19239. Now load the respective single-cell data:
+This ground truth has been produce for the comparison of individual NA19101 to
+NA19239. Now load the respective single-cell data:
 
 ```{r}
 molecules <- read.table("data/tung/molecules.txt", sep = "\t")
@@ -46,17 +50,19 @@ group <- anno[keep,1]
 batch <- anno[keep,4]
 # remove genes that aren't expressed in at least 6 cells
 gkeep <- rowSums(data > 0) > 5;
-counts <- data[gkeep,]
+counts_mat <- as.matrix(data[gkeep,])
 # Library size normalization
-lib_size = colSums(counts)
-norm <- t(t(counts)/lib_size * median(lib_size)) 
+lib_size <- colSums(counts_mat)
+norm <- t(t(counts_mat)/lib_size * median(lib_size)) 
 # Variant of CPM for datasets with library sizes of fewer than 1 mil molecules
 ```
 
 Now we will compare various single-cell DE methods. We will focus on methods
-that performed well in Soneson and Robinson's [2019; CITE] detailed comparison
-of differential expression methods for single-cell data. Note that we will only
-be running methods which are available as R-packages and run relatively quickly.
+that performed well in Soneson and Robinson's [@Soneson2018-hy] detailed
+comparison of differential expression methods for single-cell data. Note that we
+will only be running methods which are available as R-packages and run
+relatively quickly.
+
 
 ### Kolmogorov-Smirnov test
 
@@ -103,10 +109,14 @@ length(sigDE)
 sum(GroundTruth$DE %in% sigDE) 
 # Number of KS-DE genes that are true DE genes
 sum(GroundTruth$notDE %in% sigDE)
-# Number of KS-DE genes that are truly not-DE
+                                        # Number of KS-DE genes that are truly not-DE
 ```
 
-As you can see many more of our ground truth negative genes were identified as DE by the KS-test (false positives) than ground truth positive genes (true positives), however this may be due to the larger number of notDE genes thus we typically normalize these counts as the True positive rate (TPR), TP/(TP + FN), and False positive rate (FPR), FP/(FP+TP).
+As you can see many more of our ground truth negative genes were identified as
+DE by the KS-test (false positives) than ground truth positive genes (true
+positives), however this may be due to the larger number of notDE genes thus we
+typically normalize these counts as the True positive rate (TPR), TP/(TP + FN),
+and false discovery rate (FPR), FP/(FP+TP).
 
 ```{r}
 tp <- sum(GroundTruth$DE %in% sigDE)
@@ -114,12 +124,18 @@ fp <- sum(GroundTruth$notDE %in% sigDE)
 tn <- sum(GroundTruth$notDE %in% names(pVals)[pVals >= 0.05])
 fn <- sum(GroundTruth$DE %in% names(pVals)[pVals >= 0.05])
 tpr <- tp/(tp + fn)
-fpr <- fp/(fp + tn)
-cat(c(tpr, fpr))
+fdr <- fp/(fp + tp)
+cat(c(tpr, fdr), "\n")
 ```
-Now we can see the TPR is much higher than the FPR indicating the KS test is identifying DE genes.
 
-So far we've only evaluated the performance at a single significance threshold. Often it is informative to vary the threshold and evaluate performance across a range of values. This is then plotted as a receiver-operating-characteristic curve (ROC) and a general accuracy statistic can be calculated as the area under this curve (AUC). We will use the ROCR package to facilitate this plotting.
+Now we can see the TPR is OK, but the FDR is very high--- much higher than we
+would normally deem acceptable.
+
+So far we've only evaluated the performance at a single significance threshold.
+Often it is informative to vary the threshold and evaluate performance across a
+range of values. This is then plotted as a receiver-operating-characteristic
+curve (ROC) and a general accuracy statistic can be calculated as the area under
+this curve (AUC). We will use the ROCR package to facilitate this plotting.
 
 ```{r ks-roc-plot, fig.cap="ROC curve for KS-test."}
 # Only consider genes for which we know the ground truth
@@ -133,24 +149,33 @@ ROCR::plot(perf)
 aucObj <- ROCR::performance(pred, "auc")
 aucObj@y.values[[1]] # AUC
 ```
-Finally to facilitate the comparisons of other DE methods let's put this code into a function so we don't need to repeat it:
+
+Finally to facilitate the comparisons of other DE methods let's put this code
+into a function so we don't need to repeat it:
 
 ```{r}
-DE_Quality_AUC <- function(pVals) {
+DE_Quality_AUC <- function(pVals, plot=TRUE) {
     pVals <- pVals[names(pVals) %in% GroundTruth$DE | 
                    names(pVals) %in% GroundTruth$notDE]
     truth <- rep(1, times = length(pVals));
     truth[names(pVals) %in% GroundTruth$DE] = 0;
     pred <- ROCR::prediction(pVals, truth)
     perf <- ROCR::performance(pred, "tpr", "fpr")
-    ROCR::plot(perf)
+    if (plot)
+      ROCR::plot(perf)
     aucObj <- ROCR::performance(pred, "auc")
     return(aucObj@y.values[[1]])
 }
 ```
-### Wilcox/Mann-Whitney-U Test
 
-The Wilcox-rank-sum test is another non-parametric test, but tests specifically if values in one group are greater/less than the values in the other group. Thus it is often considered a test for difference in median expression between two groups; whereas the KS-test is sensitive to any change in distribution of expression values.
+
+### Wilcoxon/Mann-Whitney-U Test
+
+The Wilcoxon-rank-sum test is another non-parametric test, but tests
+specifically if values in one group are greater/less than the values in the
+other group. Thus it is often considered a test for difference in median
+expression between two groups; whereas the KS-test is sensitive to any change in
+distribution of expression values.
 
 ```{r wilcox-plot, fig.cap="ROC curve for Wilcox test.", message=FALSE, warning=FALSE}
 pVals <- apply(
@@ -173,17 +198,34 @@ We've already used edgeR for differential expression in Chapter
 gene expression and uses a generalized linear model (GLM) framework, the enables
 us to include other factors such as batch to the model.
 
-```{r edger-plot, fig.cap="ROC curve for edgeR.", message=FALSE}
+`edgeR`, like many DE methods work best with pre-filtering of very lowly
+experessed genes. Here, we will keep genes with non-zero expression in at least
+30 cells.
+
+The first steps for this analysis then involve 
+
+```{r edger-plot, fig.cap="Biological coefficient of variation plot for edgeR.", message=FALSE}
+keep_gene <- (rowSums(counts_mat > 0) > 29.5 & rowMeans(counts_mat) > 0.2)
+table(keep_gene)
 dge <- DGEList(
-    counts = counts, 
-    norm.factors = rep(1, length(counts[1,])), 
+    counts_mat = counts_mat[keep_gene,], 
+    norm.factors = rep(1, length(counts_mat[1,])), 
     group = group
 )
 group_edgeR <- factor(group)
 design <- model.matrix(~ group_edgeR)
-dge <- estimateDisp(dge, design = design, trend.method = "none")
-fit <- glmFit(dge, design)
-res <- glmLRT(fit)
+dge <- calcNormFactors(dge, method = "TMM")
+summary(dge$samples$norm.factors)
+dge <- estimateDisp(dge, design = design)
+plotBCV(dge)
+```
+
+Next we fit a negative binomial quasi-likelihood model for differential
+expression.
+
+```{r edger-plot, fig.cap="ROC curve for edgeR.", message=FALSE}
+fit <- glmQLFit(dge, design)
+res <- glmQLFTest(fit)
 pVals <- res$table[,4]
 names(pVals) <- rownames(res$table)
 
@@ -200,7 +242,7 @@ using a hurdle model to combine tests of discrete (0 vs not zero) and continuous
 framework to enable complex models to be considered.
 
 ```{r MAST-plot, fig.cap="ROC curve for MAST.", message=FALSE}
-log_counts <- log(counts + 1) / log(2)
+log_counts <- log2(counts_mat + 1)
 fData <- data.frame(names = rownames(log_counts))
 rownames(fData) <- rownames(log_counts);
 cData <- data.frame(cond = group)
@@ -211,7 +253,7 @@ colData(obj)$cngeneson <- scale(colSums(assay(obj) > 0))
 cond <- factor(colData(obj)$cond)
 
 # Model expression as function of condition & number of detected genes
-zlmCond <- zlm.SingleCellAssay(~ cond + cngeneson, obj) 
+zlmCond <- zlm(~ cond + cngeneson, obj) 
 
 summaryCond <- summary(zlmCond, doLRT = "condNA19101")
 summaryDt <- summaryCond$datatable
@@ -226,10 +268,78 @@ DE_Quality_AUC(pVals)
 
 ### limma
 
+The `limma` package implements Empirical Bayes linear models for identifying
+differentially expressed genes. Originally developed more microarray data, the
+extended limma-voom method has proven to be a very fast and accurate DE method
+for bulk RNA-seq data. With its speed and flexibility, `limma` also performed
+well for DE analysis of scRNA-seq data in the benchmarking study mentioned
+earlier.
+
+Here we demonstrate how to use `limma` for DE analysis on these data.
+
+```{r limma-de}
+library(limma)
+## we can use the same design matrix structure as for edgeR
+design <- model.matrix(~ group_edgeR)
+colnames(design)
+## apply the same filtering of low-abundance genes as for edgeR
+keep_gene <- (rowSums(counts_mat > 0) > 29.5 & rowMeans(counts_mat) > 0.2)
+summary(keep_gene)
+y <- counts_mat[keep_gene, ]
+v <- voom(y, design)
+fit <- lmFit(v, design)
+res <- eBayes(fit)
+topTable(res)
+pVals <- res$p.value
+names(pVals) <- rownames(y)
+pVals <- p.adjust(pVals, method = "fdr")
+DE_Quality_AUC(pVals)
+```
+
+__Q:__ How does `limma` compare on this dataset?
 
 
 ### Pseudobulk
 
+In this approach, we sum counts across cells from the same condition (here we
+will use batch within individual) to create "pseudobulk" samples that we will
+analyse for DE the way we would analyse bulk RNA-seq data. Research continues on
+benchmarking pseudobulk and single-cell DE approaches, but the suggestion from
+certain (we think reliable) sources is that pseudobulk approaches are more
+robust and accurate (and typically very fast) in a range of settings where we're
+interested in differences between groups or conditions rather than individual
+cells.
+
+```{r pseudobulk}
+summed <- scater::sumCountsAcrossCells(counts_mat, factor(batch))
+head(summed)
+y <- DGEList(summed)
+y <- y[aveLogCPM(y) > 1,]
+y <- calcNormFactors(y)
+individ <- gsub("\\.r[123]", "", colnames(summed))
+design <- model.matrix(~individ)
+head(design)
+y <- estimateDisp(y, design)
+summary(y$trended.dispersion)
+plotBCV(y)
+fit <- glmQLFit(y)
+res <- glmQLFTest(fit)
+summary(decideTests(res))
+pVals <- res$table[,4]
+names(pVals) <- rownames(res$table)
+pVals <- p.adjust(pVals, method = "fdr")
+DE_Quality_AUC(pVals)
+```
+
+__Q:__ What do you make of these pseudobulk results? What are the pros and cons
+of this approach?
+
+### Reflection
+
+* What is your overall assessment of the DE methods presented in this chapter? 
+* When might you choose to use one method in preference to others?
+* What caveats or potential problems can you think of with conducting DE testing
+  using approaches like those presented here?
 
 ### sessionInfo()
 
diff --git a/course_files/imputation.Rmd b/course_files/imputation.Rmd
index c6e1200..6389fd5 100644
--- a/course_files/imputation.Rmd
+++ b/course_files/imputation.Rmd
@@ -3,7 +3,7 @@ output: html_document
 ---
 
 ```{r setup, echo=FALSE}
-knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=FALSE)
+knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE, message=FALSE, warning=FALSE)
 knitr::opts_knit$set(root.dir = normalizePath(".."))
 ```
 
@@ -24,17 +24,21 @@ As discussed previously, one of the main challenges when analyzing scRNA-seq
 data is the presence of zeros, or dropouts. The dropouts are assumed to have 
 arisen for three possible reasons:
 
-* The gene was not expressed in the cell and hence there are no transcripts to sequence
-* The gene was expressed, but for some reason the transcripts were lost somewhere prior to sequencing
-* The gene was expressed and transcripts were captured and turned into cDNA, but the sequencing depth was not sufficient to produce any reads.
-
-Thus, dropouts could be result of experimental shortcomings, and if this is 
-the case then we would like to provide computational corrections. One possible
- solution is to impute the dropouts in the expression matrix. To be able to 
-impute gene expression values, one must have an underlying model. However, 
-since we do not know which dropout events are technical artefacts and which 
-correspond to the transcript being truly absent, imputation is a difficult 
-challenge and prone to creating [false-positive results in downstream analysis](https://f1000research.com/articles/7-1740/v2).
+* The gene was not expressed in the cell and hence there are no transcripts to
+  sequence
+* The gene was expressed, but for some reason the transcripts were lost
+  somewhere prior to sequencing
+* The gene was expressed and transcripts were captured and turned into cDNA, but
+  the sequencing depth was not sufficient to produce any reads.
+
+Thus, dropouts could be result of experimental shortcomings, and if this is the
+case then we would like to provide computational corrections. One possible
+solution is to impute the dropouts in the expression matrix. To be able to
+impute gene expression values, one must have an underlying model. However, since
+we do not know which dropout events are technical artefacts and which correspond
+to the transcript being truly absent, imputation is a difficult challenge and
+prone to creating [false-positive results in downstream
+analysis](https://f1000research.com/articles/7-1740/v2).
 
 There are many different imputation methods available we will consider three 
 fast, published methods:
@@ -50,7 +54,9 @@ imputed all inflated zeros.
 
 ### scImpute
 
-To test `scImpute`, we use the default parameters and we apply it to the Deng dataset that we have worked with before. scImpute takes a .csv or .txt file as an input:
+To test `scImpute`, we use the default parameters and we apply it to the Deng
+dataset that we have worked with before. scImpute takes a .csv or .txt file as
+an input:
 
 ```{r message=FALSE, warning=FALSE, paged.print=FALSE}
 deng <- readRDS("data/deng/deng-reads.rds")
@@ -81,17 +87,19 @@ plotPCA(
 )
 ```
 
-Compare this result to the original data in Chapter \@ref(clust-methods). What are the most significant differences?
+Compare this result to the original data in Chapter \@ref(clust-methods). What
+are the most significant differences?
 
 We can examine the expression of specific genes to directly see the effect of
  imputation on the expression distribution.
+ 
 ```{r}
 plotExpression(res, c("Sox2", "Eomes", "Zscan4d", "Fgf4"))
 plotExpression(deng, c("Sox2", "Eomes", "Zscan4d", "Fgf4"))
 ```
 
-
 To evaluate the impact of the imputation, we use `SC3` to cluster the imputed matrix
+
 ```{r message=FALSE, warning=FALSE, paged.print=FALSE}
 res <- sc3_estimate_k(res)
 metadata(res)$sc3$k_estimation
@@ -107,11 +115,14 @@ __Exercise:__ Based on the PCA and the clustering results, do you think that imp
 
 ### DrImpute
 
-We can do the same for DrImpute. DrImpute runs on a log-normalized expression matrix directly in R, we generate this matrix using scater, then run DrImpute. Unlike scImpute, DrImpute considers the consensus imputation across a range of ks using two differ correlation distances:
+We can do the same for DrImpute. DrImpute runs on a log-normalized expression
+matrix directly in R, we generate this matrix using scater, then run DrImpute.
+Unlike scImpute, DrImpute considers the consensus imputation across a range of
+ks using two differ correlation distances:
 
 ```{r}
 deng <- normalize(deng)
-res <- DrImpute(deng@assays[["logcounts"]], ks=8:12)
+res <- DrImpute(logcounts(deng), ks=8:12)
 colnames(res) <- colnames(deng)
 rownames(res) <- rownames(deng)
 res <- SingleCellExperiment(
@@ -125,10 +136,13 @@ plotPCA(
 )
 plotExpression(res, c("Sox2", "Eomes", "Zscan4d", "Fgf4"))
 ```
-__Exercise:__ Check the sc3 clustering of the DrImpute matrix, do you think that imputation using `DrImpute` is a good idea for the Deng dataset?
+
+__Exercise:__ Check the sc3 clustering of the DrImpute matrix, do you think that
+imputation using `DrImpute` is a good idea for the Deng dataset?
 
 
-__Exercise:__ What is the difference between `scImpute` and `DrImpute` based on the PCA and clustering analysis? Which one do you think is best to use?
+__Exercise:__ What is the difference between `scImpute` and `DrImpute` based on
+the PCA and clustering analysis? Which one do you think is best to use?
 
 
 ### sessionInfo()
diff --git a/course_files/projection.Rmd b/course_files/projection.Rmd
index 37cc805..0d4488a 100644
--- a/course_files/projection.Rmd
+++ b/course_files/projection.Rmd
@@ -3,7 +3,7 @@ output: html_document
 ---
 
 ```{r setup, echo=FALSE}
-knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=FALSE)
+knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE, warning=FALSE, message=FALSE)
 knitr::opts_knit$set(root.dir = normalizePath(".."))
 ```
 
@@ -20,21 +20,27 @@ library(scater)
 library(SingleCellExperiment)
 ```
 
-### Introduction
-
-As more and more scRNA-seq datasets become available, carrying merged_seurat comparisons between them is key. There are two main
-approaches to comparing scRNASeq datasets. The first approach is "label-centric" which is focused on trying
- to identify equivalent cell-types/states across datasets by comparing individual cells or groups of cells. The other approach
- is "cross-dataset normalization" which attempts to computationally remove experiment-specific technical/biological effects
-so that data from multiple experiments can be combined and jointly analyzed.
-
-The label-centric approach can be used with dataset with high-confidence cell-annotations, e.g. the Human Cell Atlas (HCA)
-[@Regev2017-mw] or the Tabula Muris [@Quake2017] once they are completed, to project cells or clusters from a new sample onto
-this reference to consider tissue composition and/or identify cells with novel/unknown identity. Conceptually, such
-projections are similar to the popular BLAST method [@Altschul1990-ts], which makes it possible to quickly find the closest
-match in a database for a newly identified nucleotide or amino acid sequence. The label-centric approach can also be used to
-compare datasets of similar biological origin collected by different labs to ensure that the annotation and the analysis is
-consistent.
+## Introduction
+
+As more and more scRNA-seq datasets become available, carrying merged_seurat
+comparisons between them is key. There are two main approaches to comparing
+scRNASeq datasets. The first approach is "label-centric" which is focused on
+trying to identify equivalent cell-types/states across datasets by comparing
+individual cells or groups of cells. The other approach is "cross-dataset
+normalization" which attempts to computationally remove experiment-specific
+technical/biological effects so that data from multiple experiments can be
+combined and jointly analyzed.
+
+The label-centric approach can be used with dataset with high-confidence
+cell-annotations, e.g. the Human Cell Atlas (HCA) [@Regev2017-mw] or the Tabula
+Muris [@Quake2017] once they are completed, to project cells or clusters from a
+new sample onto this reference to consider tissue composition and/or identify
+cells with novel/unknown identity. Conceptually, such projections are similar to
+the popular BLAST method [@Altschul1990-ts], which makes it possible to quickly
+find the closest match in a database for a newly identified nucleotide or amino
+acid sequence. The label-centric approach can also be used to compare datasets
+of similar biological origin collected by different labs to ensure that the
+annotation and the analysis is consistent.
 
 ```{r, echo=FALSE, merged_seurat.width = '80%', fig.cap="Label-centric dataset comparison can be used to compare the annotations of two different samples."}
 knitr::include_graphics("figures/CourseCompareTypes.png")
@@ -44,77 +50,111 @@ knitr::include_graphics("figures/CourseCompareTypes.png")
 knitr::include_graphics("figures/CourseAtlasAssignment.png")
 ```
 
-The cross-dataset normalization approach can also be used to compare datasets of similar biological origin, unlike the label-centric approach it enables the join analysis of multiple datasets to facilitate the identification of rare cell-types which may to too sparsely sampled in each individual dataset to be reliably detected. However, cross-dataset normalization is not applicable to very large and diverse references since it assumes a significant portion of the biological variablility in each of the datasets overlaps with others.
+The cross-dataset normalization approach can also be used to compare datasets of
+similar biological origin, unlike the label-centric approach it enables the join
+analysis of multiple datasets to facilitate the identification of rare
+cell-types which may to too sparsely sampled in each individual dataset to be
+reliably detected. However, cross-dataset normalization is not applicable to
+very large and diverse references since it assumes a significant portion of the
+biological variablility in each of the datasets overlaps with others.
 
 
 ```{r, echo=FALSE, merged_seurat.width = '80%', fig.cap="Cross-dataset normalization enables joint-analysis of 2+ scRNASeq datasets."}
 knitr::include_graphics("figures/CourseCrossNorm.png")
 ```
-### Datasets
+## Datasets
+
+We will running these methods on two human pancreas datasets: [@Muraro2016-yk]
+and [@Segerstolpe2016-wc]. Since the pancreas has been widely studied, these
+datasets are well annotated.
 
-We will running these methods on two human pancreas datasets: [@Muraro2016-yk] and [@Segerstolpe2016-wc]. Since the pancreas has been widely studied, these datasets are well annotated.
 ```{r}
 muraro <- readRDS("data/pancreas/muraro.rds")
 segerstolpe <- readRDS("data/pancreas/segerstolpe.rds")
 ```
 
-This data has already been formatted for scmap. Cell type labels must be stored in the `cell_type1` column of the `colData` slots, and gene ids that are consistent across both datasets must be stored in the `feature_symbol` column of the `rowData` slots.
+This data has already been formatted for scmap. Cell type labels must be stored
+in the `cell_type1` column of the `colData` slots, and gene ids that are
+consistent across both datasets must be stored in the `feature_symbol` column of
+the `rowData` slots.
 
 First, lets check our gene-ids match across both datasets:
+
 ```{r}
 sum(rowData(muraro)$feature_symbol %in% rowData(segerstolpe)$feature_symbol)/nrow(muraro)
 sum(rowData(segerstolpe)$feature_symbol %in% rowData(muraro)$feature_symbol)/nrow(segerstolpe)
 ```
-Here we can see that 96% of the genes present in muraro match genes in segerstople and 72% of genes in segerstolpe are match
-genes in muraro. This is as expected because the segerstolpe dataset was more deeply sequenced than the muraro dataset.
-However, it highlights some of the difficulties in comparing scRNASeq datasets.
+
+Here we can see that 96% of the genes present in muraro match genes in
+segerstople and 72% of genes in segerstolpe are match genes in muraro. This is
+as expected because the segerstolpe dataset was more deeply sequenced than the
+muraro dataset. However, it highlights some of the difficulties in comparing
+scRNASeq datasets.
 
 We can confirm this by checking the overall size of these two datasets.
+
 ```{r}
 dim(muraro)
 dim(segerstolpe)
 ```
 
-In addition, we can check the cell-type annotations for each of these dataset using the command below:
+In addition, we can check the cell-type annotations for each of these dataset
+using the command below:
 
 ```{r}
 summary(factor(colData(muraro)$cell_type1))
 summary(factor(colData(segerstolpe)$cell_type1))
 ```
-Here we can see that even though both datasets considered the same biological tissue the two datasets, they have been
-annotated with slightly different sets of cell-types. If you are familiar withpancreas biology you might recognize
-that the pancreatic stellate cells (PSCs) in segerstolpe are a type of mesenchymal stem cell which would fall under
-the "mesenchymal" type in muraro. However, it isn't clear whether these two annotations should be considered synonymous
-or not. We can use label-centric comparison methods to determine if these two cell-type annotations are indeed equivalent.
-
-Alternatively, we might be interested in understanding the function of those cells that were "unclassified endocrine" or were deemed
-too poor quality ("not applicable") for the original clustering in each dataset by leveraging in formation across datasets. Either
-we could attempt to infer which of the existing annotations they most likely belong to using label-centric approaches or we could try
-to uncover a novel cell-type among them (or a sub-type within the existing annotations) using cross-dataset normalization.
 
-To simplify our demonstration analyses we will remove the small classes of unassigned cells, and the poor quality cells. We will retain the "unclassified endocrine" to
-see if any of these methods can elucidate what cell-type they belong to.
+Here we can see that even though both datasets considered the same biological
+tissue, they have been annotated with slightly different sets of cell-types. If
+you are familiar with pancreas biology you might recognize that the pancreatic
+stellate cells (PSCs) in `segerstolpe` are a type of mesenchymal stem cell which
+would fall under the "mesenchymal" type in `muraro`. However, it isn't clear
+whether these two annotations should be considered synonymous or not. We can use
+label-centric comparison methods to determine if these two cell-type annotations
+are indeed equivalent.
+
+Alternatively, we might be interested in understanding the function of those
+cells that were "unclassified endocrine" or were deemed too poor quality ("not
+applicable") for the original clustering in each dataset by leveraging in
+formation across datasets. Either we could attempt to infer which of the
+existing annotations they most likely belong to using label-centric approaches
+or we could try to uncover a novel cell-type among them (or a sub-type within
+the existing annotations) using cross-dataset normalization.
+
+To simplify our demonstration analyses we will remove the small classes of
+unassigned cells, and the poor quality cells. We will retain the "unclassified
+endocrine" to see if any of these methods can elucidate what cell-type they
+belong to.
 
 ```{r}
 segerstolpe <- segerstolpe[,colData(segerstolpe)$cell_type1 != "unclassified"]
 segerstolpe <- segerstolpe[,colData(segerstolpe)$cell_type1 != "not applicable",]
 muraro <- muraro[,colData(muraro)$cell_type1 != "unclear"]
 ```
-### Projecting cells onto annotated cell-types (scmap)
+
+## Projecting cells onto annotated cell-types (scmap)
 
 ```{r, echo=TRUE, message=FALSE, warning=FALSE}
 library(scmap)
 set.seed(1234567)
 ```
 
-We recently developed `scmap` [@Kiselev2017-nb] - a method for projecting cells from a scRNA-seq experiment onto the cell-types identified in other experiments. Additionally, a cloud version of `scmap` can be run for free, withmerged_seurat restrictions, from [http://www.hemberg-lab.cloud/scmap](http://www.hemberg-lab.cloud/scmap).
+Martin Hemberg and colleagues recently developed `scmap` [@Kiselev2017-nb]---a
+method for projecting cells from a scRNA-seq experiment onto the cell-types
+identified in other experiments. Additionally, a cloud version of `scmap` can be
+run for free, without restrictions, from
+[http://www.hemberg-lab.cloud/scmap](http://www.hemberg-lab.cloud/scmap).
 
 
 #### Feature Selection
-Once we have a `SingleCellExperiment` object we can run `scmap`. First we have to build the "index" of our reference
-clusters. Since we want to know whether PSCs and mesenchymal cells are synonymous we will project each dataset to the
-other so we will build an index for each dataset. This requires first selecting the most informative features for the
-reference dataset.
+
+Once we have a `SingleCellExperiment` object we can run `scmap`. First we have
+to build the "index" of our reference clusters. Since we want to know whether
+PSCs and mesenchymal cells are synonymous we will project each dataset to the
+other so we will build an index for each dataset. This requires first selecting
+the most informative features for the reference dataset.
 
 ```{r}
 muraro <- selectFeatures(muraro, suppress_plot = FALSE)
@@ -125,7 +165,9 @@ Genes highlighted with the red colour will be used in the futher analysis (proje
 ```{r}
 segerstolpe <- selectFeatures(segerstolpe, suppress_plot = FALSE)
 ```
-From the y-axis of these plots we can see that scmap uses a dropmerged_seurat-based feature selection method.
+
+From the y-axis of these plots we can see that scmap uses a
+dropmerged_seurat-based feature selection method.
 
 Now calculate the cell-type index:
 ```{r}
@@ -134,14 +176,17 @@ segerstolpe <- indexCluster(segerstolpe)
 ```
 
 We can also visualize the index:
+
 ```{r}
 heatmap(as.matrix(metadata(muraro)$scmap_cluster_index))
 ```
 
-You may want to adjust your features using the `setFeatures` function if features are too heavily concentrated in only a few cell-types. In this case the dropmerged_seurat-based features look good so we will just them.
+You may want to adjust your features using the `setFeatures` function if
+features are too heavily concentrated in only a few cell-types. In this case the
+dropout-based features look good so we will just them.
 
-__Exercise__
-Using the rowData of each dataset how many genes were selected as features in both datasets? What does this tell you abmerged_seurat these datasets?
+__Exercise__ Using the rowData of each dataset how many genes were selected as
+features in both datasets? What does this tell you about these datasets?
 
 __Answer__
 ```{r, include=FALSE, eval=FALSE}
@@ -149,11 +194,17 @@ ans = sum(rowData(muraro)[rowData(muraro)$scmap_features, "feature_symbol"] %in%
 print(ans)
 ```
 
-#### Projecting
+### Projecting
 
-scmap computes the distance from each cell to each cell-type in the reference index, then applies an empirically derived threshold to determine which cells are assigned to the closest reference cell-type and which are unassigned. To account for differences in sequencing depth distance is calculated using the spearman correlation and cosine distance and only cells with a consistent assignment with both distances are returned as assigned.
+scmap computes the distance from each cell to each cell-type in the reference
+index, then applies an empirically derived threshold to determine which cells
+are assigned to the closest reference cell-type and which are unassigned. To
+account for differences in sequencing depth distance is calculated using the
+spearman correlation and cosine distance and only cells with a consistent
+assignment with both distances are returned as assigned.
 
 We will project the `segerstolpe` dataset to `muraro` dataset:
+
 ```{r, warning=FALSE}
 seger_to_muraro <- scmapCluster(
   projection = segerstolpe,
@@ -173,39 +224,54 @@ muraro_to_seger <- scmapCluster(
   )
 )
 ```
-Note that in each case we are projecting to a single dataset but that this could be extended to any number of datasets for which we have computed indices.
+
+Note that in each case we are projecting to a single dataset but that this could
+be extended to any number of datasets for which we have computed indices.
 
 Now lets compare the original cell-type labels with the projected labels:
+
 ```{r}
 table(colData(muraro)$cell_type1, muraro_to_seger$scmap_cluster_labs)
 ```
-Here we can see that cell-types do map to their equivalents in segerstolpe, and importantly we see that all but one of the "mesenchymal" cells were assigned to the "PSC" class.
+
+Here we can see that cell-types do map to their equivalents in segerstolpe, and
+importantly we see that all but one of the "mesenchymal" cells were assigned to
+the "PSC" class.
 
 ```{r}
 table(colData(segerstolpe)$cell_type1, seger_to_muraro$scmap_cluster_labs)
 ```
-Again we see cell-types match each other and that all but one of the "PSCs" match the "mesenchymal" cells providing strong evidence that these two annotations should be considered synonymous.
 
-We can also visualize these tables using a [Sankey diagram](https://developers.google.com/chart/interactive/docs/gallery/sankey):
+Again we see cell-types match each other and that all but one of the "PSCs"
+match the "mesenchymal" cells providing strong evidence that these two
+annotations should be considered synonymous.
+
+We can also visualize these tables using a [Sankey
+diagram](https://developers.google.com/chart/interactive/docs/gallery/sankey):
+
 ```{r results='asis', tidy=FALSE}
 plot(getSankey(colData(muraro)$cell_type1,  muraro_to_seger$scmap_cluster_labs[,1], plot_height=400))
 ```
 
-__Exercise__
-How many of the previously unclassified cells would be be able to assign to cell-types using scmap?
+__Exercise__ How many of the previously unclassified cells would be be able to
+assign to cell-types using scmap?
 
 __Answer__
 ```{r, include=FALSE}
 t2 <- table(colData(segerstolpe)$cell_type1, seger_to_muraro$scmap_cluster_labs)
 seger_assigned <- sum(t2[c("unclassified endocrine"), 1:9])
 ```
-### Cell-to-Cell mapping
 
-scmap can also project each cell in one dataset to its approximate closest neighbouring cell in the reference dataset. This
-uses a highly optimized search algorithm allowing it to be scaled to very large references (in theory 100,000-millions of
-cells). However, this process is stochastic so we must fix the random seed to ensure we can reproduce our results.
+## Cell-to-Cell mapping
 
-We have already performed feature selection for this dataset so we can go straight to building the index.
+`scmap` can also project each cell in one dataset to its approximate closest
+neighbouring cell in the reference dataset. This uses a highly optimized search
+algorithm allowing it to be scaled to very large references (in theory
+100,000--millions of cells). However, this process is stochastic so we must fix
+the random seed to ensure we can reproduce our results.
+
+We have already performed feature selection for this dataset so we can go
+straight to building the index.
 
 ```{r}
 set.seed(193047)
@@ -213,17 +279,21 @@ segerstolpe <- indexCell(segerstolpe)
 muraro <- indexCell(muraro)
 ```
 
-In this case the index is a series of clusterings of each cell using different sets of features, parameters k and M are the
-number of clusters and the number of features used in each of these subclusterings. New cells are assigned to the nearest
- cluster in each subclustering to generate unique pattern of cluster assignments. We then find the cell in the reference
-dataset with the same or most similar pattern of cluster assignments.
+In this case the index is a series of clusterings of each cell using different
+sets of features, parameters k and M are the number of clusters and the number
+of features used in each of these subclusterings. New cells are assigned to the
+nearest cluster in each subclustering to generate unique pattern of cluster
+assignments. We then find the cell in the reference dataset with the same or
+most similar pattern of cluster assignments.
 
 We can examine the cluster assignment patterns for the reference datasets using:
+
 ```{r}
 metadata(muraro)$scmap_cell_index$subclusters[1:5,1:5]
 ```
 
 To project and find the `w` nearest neighbours we use a similar command as before:
+
 ```{r}
 muraro_to_seger <- scmapCell(
   projection = muraro,
@@ -235,98 +305,100 @@ muraro_to_seger <- scmapCell(
 ```
 
 We can again look at the results:
+
 ```{r}
 muraro_to_seger$seger[[1]][,1:5]
 ```
-This shows the column number of the 5 nearest neighbours in segerstolpe to each of the cells in muraro. We could then calculate a pseudotime estimate, branch assignment, or other cell-level data by selecting the appropriate data from the colData of
-the segerstolpe data set. As a demonstration we will find the cell-type of the nearest neighbour of each cell.
+
+This shows the column number of the 5 nearest neighbours in segerstolpe to each
+of the cells in muraro. We could then calculate a pseudotime estimate, branch
+assignment, or other cell-level data by selecting the appropriate data from the
+colData of the segerstolpe data set. As a demonstration we will find the
+cell-type of the nearest neighbour of each cell.
+
 ```{r}
 cell_type_NN <- colData(segerstolpe)$cell_type1[muraro_to_seger$seger[[1]][1,]]
 head(cell_type_NN)
 ```
 
-### Metaneighbour
+__Exercise:__ How well does the cell-to-cell mapping work, in your reckoning?
+How many cell type annotations agree/disagree between the top nearest-neighbour
+cells?
 
-[Metaneighbour](https://www.biorxiv.org/content/early/2017/06/16/150524) is specifically designed to ask whether cell-type labels are consistent across datasets. It comes in two
-versions. First is a fully supervised method which assumes cell-types are known in all datasets and calculates how "good" those cell-type labels are. (The precise meaning of "good" will be described below). Alternatively, metaneighbour can estimate how similar all cell-types are to each other both within and across datasets. We will only be using the unsupervised version as it has much more general applicability and is easier to interpret the results of.
 
-Metaneighbour compares cell-types across datasets by building a cell-cell spearman correlation network. The method then tries to predict the label of each cell through weighted "votes" of its nearest-neighbours. Then scores the overall similarity between two clusters as the AUROC for assigning cells of typeA to typeB based on these weighted votes. AUROC of 1 would indicate all the cells of typeA were assigned to typeB before any other cells were, and an AUROC of 0.5 is what you would get if cells were being randomly assigned.
+## mnnCorrect
 
-Metanighbour is just a couple of R functions not a complete package so we have to load them using `source`
-```{r}
-source("course_files/utils/2017-08-28-runMN-US.R")
-```
-#### Prepare Data
-
-Metaneighbour requires all datasets to be combined into a single expression matrix prior to running:
-
-```{r}
-is.common <- rowData(muraro)$feature_symbol %in% rowData(segerstolpe)$feature_symbol
-muraro <- muraro[is.common,]
-segerstolpe <- segerstolpe[match(rowData(muraro)$feature_symbol, rowData(segerstolpe)$feature_symbol),]
-rownames(segerstolpe) <- rowData(segerstolpe)$feature_symbol
-rownames(muraro) <- rowData(muraro)$feature_symbol
-identical(rownames(segerstolpe), rownames(muraro))
+[mnnCorrect](https://www.biorxiv.org/content/early/2017/07/18/165118) corrects
+datasets to facilitate joint analysis. It order to account for differences in
+composition between two replicates or two different experiments it first matches
+invidual cells across experiments to find the overlaping biologicial structure.
+Using that overlap it learns which dimensions of expression correspond to the
+biological state and which dimensions correspond to batch/experiment effect;
+mnnCorrect assumes these dimensions are orthologal to each other in high
+dimensional expression space. Finally it removes the batch/experiment effects
+from the entire expression matrix to return the corrected matrix.
 
-combined_logcounts <- cbind(logcounts(muraro), logcounts(segerstolpe))
-dataset_labels <- rep(c("m", "s"), times=c(ncol(muraro), ncol(segerstolpe)))
-cell_type_labels <- c(colData(muraro)$cell_type1, colData(segerstolpe)$cell_type1)
+To match individual cells to each other across datasets, mnnCorrect uses the
+cosine distance to avoid library-size effect then identifies mututal nearest
+neighbours (`k` determines to neighbourhood size) across datasets. Only
+overlaping biological groups should have mutual nearest neighbours (see panel b
+below). However, this assumes that k is set to approximately the size of the
+smallest biological group in the datasets, but a k that is too low will identify
+too few mutual nearest-neighbour pairs to get a good estimate of the batch
+effect we want to remove.
 
-pheno <- data.frame(Sample_ID = colnames(combined_logcounts),
-                Study_ID=dataset_labels,
-                Celltype=paste(cell_type_labels, dataset_labels, sep="-"))
-rownames(pheno) <- colnames(combined_logcounts)
-```
+Learning the biological/technical effects is done with either singular value
+decomposition, similar to RUV we encounters in the batch-correction section, or
+with principal component analysis with the opitimized irlba package, which
+should be faster than SVD. The parameter `svd.dim` specifies how many dimensions
+should be kept to summarize the biological structure of the data, we will set it
+to three as we found three major groups using Metaneighbor above. These
+estimates may be futher adjusted by smoothing (`sigma`) and/or variance
+adjustment (`var.adj`).
 
-Metaneighbor includes a feature selection method to identify highly variable genes.
+mnnCorrect also assumes you've already subset your expression matricies so that
+they contain identical genes in the same order, so we need to ensure that is the
+case before proceeding with mnnCorrect.
 
-```{r}
-var.genes = get_variable_genes(combined_logcounts, pheno)
-```
+_Note:_ The analysis presented below is relatively time and memory consuming. As
+such, we have not run these commands when compiling this website and therefore
+no output is shown here. Approach with caution if you are trying to run this
+analysis on a laptop!
 
-Since Metaneighbor is much slower than `scmap`, we will down sample these datasets.
 
-```{r}
-subset <- sample(1:nrow(pheno), 2000)
-combined_logcounts <- combined_logcounts[,subset]
-pheno <- pheno[subset,]
-cell_type_labels <- cell_type_labels[subset]
-dataset_labels <- dataset_labels[subset]
+```{r, echo=FALSE, merged_seurat.width = '80%', fig.cap="mnnCorrect batch/dataset effect correction. From Haghverdi et al. 2017"}
+knitr::include_graphics("figures/mnnCorrectDiagramCropped.png")
 ```
 
-Now we are ready to run Metaneighbor. First we will run the unsupervised
-version that will let us see which cell-types are most similar across the
-two datasets.
-
 ```{r}
-unsup <- run_MetaNeighbor_US(var.genes, combined_logcounts, unique(pheno$Celltype), pheno)
-heatmap(unsup)
+is.common <- rowData(muraro)$feature_symbol %in% rowData(segerstolpe)$feature_symbol
+muraro <- muraro[is.common,]
+segerstolpe <- segerstolpe[match(rowData(muraro)$feature_symbol, rowData(segerstolpe)$feature_symbol),]
+rownames(segerstolpe) <- rowData(segerstolpe)$feature_symbol
+rownames(muraro) <- rowData(muraro)$feature_symbol
+identical(rownames(segerstolpe), rownames(muraro))
 ```
 
-### mnnCorrect
-[mnnCorrect](https://www.biorxiv.org/content/early/2017/07/18/165118) corrects datasets to facilitate joint analysis. It order to account for differences in composition between two replicates or two different experiments it first matches invidual cells across experiments to find the overlaping biologicial structure. Using that overlap it learns which dimensions of expression correspond to the biological state and which dimensions correspond to batch/experiment effect; mnnCorrect assumes these dimensions are orthologal to each other in high dimensional expression space. Finally it removes the batch/experiment effects from the entire expression matrix to return the corrected matrix.
-
-To match individual cells to each other across datasets, mnnCorrect uses the cosine distance to avoid library-size effect then identifies mututal nearest neighbours (`k` determines to neighbourhood size) across datasets. Only overlaping biological groups should have mutual nearest neighbours (see panel b below). However, this assumes that k is set to approximately the size of the smallest biological group in the datasets, but a k that is too low will identify too few mutual nearest-neighbour pairs to get a good estimate of the batch effect we want to remove.
-
-Learning the biological/techncial effects is done with either singular value decomposition, similar to RUV we encounters in the batch-correction section, or with principal component analysis with the opitimized irlba package, which should be faster than SVD. The parameter `svd.dim` specifies how many dimensions should be kept to summarize the biological structure of the data, we will set it to three as we found three major groups using Metaneighbor above. These estimates may be futher adjusted by smoothing (`sigma`) and/or variance adjustment (`var.adj`).
-
-mnnCorrect also assumes you've already subset your expression matricies so that they contain identical genes in the same order, fortunately we have already done with for our datasets when we set up our data for Metaneighbor.
-
-```{r, echo=FALSE, merged_seurat.width = '80%', fig.cap="mnnCorrect batch/dataset effect correction. From Haghverdi et al. 2017"}
-knitr::include_graphics("figures/mnnCorrectDiagramCropped.png")
-```
 ```{r, eval=FALSE}
-require("batchelor")
+library("batchelor")
+identical(rownames(muraro), rownames(segerstolpe))
 # mnnCorrect will take several minutes to run
-corrected <- mnnCorrect(logcounts(muraro), logcounts(segerstolpe), k=20, sigma=1, pc.approx=TRUE, subset.row=var.genes, svd.dim=3)
+corrected <- mnnCorrect(logcounts(muraro), logcounts(segerstolpe), k=20)
 ```
-First let's check that we found a sufficient number of mnn pairs, mnnCorrect returns a list of dataframe with the mnn pairs for each dataset.
+
+First let's check that we found a sufficient number of mnn pairs, mnnCorrect
+returns a list of dataframe with the mnn pairs for each dataset.
+
 ```{r, eval=FALSE}
 dim(corrected$pairs[[1]]) # muraro -> others
 dim(corrected$pairs[[2]]) # seger -> others
 ```
-The first and second columns contain the cell column IDs and the third column contains a number indicating which dataset/batch the column 2 cell belongs to. In our case,
-we are only comparing two datasets so all the mnn pairs have been assigned to the second table and the third column contains only ones
+
+The first and second columns contain the cell column IDs and the third column
+contains a number indicating which dataset/batch the column 2 cell belongs to.
+In our case, we are only comparing two datasets so all the mnn pairs have been
+assigned to the second table and the third column contains only ones
+
 ```{r, eval=FALSE}
 head(corrected$pairs[[2]])
 total_pairs <- nrow(corrected$pairs[[2]])
@@ -355,7 +427,7 @@ summary(factor(colData(muraro)$cell_type1[muraro_matched_cells]))
 Now we could create a combined dataset to jointly analyse these data. However, the corrected data is no longer counts and usually will contain negative expression values thus some analysis tools may no longer be appropriate. For simplicity let's just plot a joint TSNE.
 
 ```{r, eval=FALSE}
-require("Rtsne")
+library("Rtsne")
 joint_expression_matrix <- cbind(corrected$corrected[[1]], corrected$corrected[[2]])
 
 # Tsne will take some time to run on the full dataset
@@ -366,15 +438,32 @@ cell_type_labels <- factor(c(colData(muraro)$cell_type1, colData(segerstolpe)$ce
 plot(joint_tsne$Y[,1], joint_tsne$Y[,2], pch=c(16,1)[dataset_labels], col=rainbow(length(levels(cell_type_labels)))[cell_type_labels])
 ```
 
-### Cannonical Correlation Analysis (Seurat)
+## "Anchor" integration (Seurat)
 
-The Seurat package contains another correction method for combining multiple datasets, called [CCA](https://www.biorxiv.org/content/early/2017/07/18/164889). However, unlike mnnCorrect it doesn't correct the expression matrix itself directly. Instead Seurat finds a lower dimensional subspace for each dataset then corrects these subspaces. Also different from mnnCorrect, Seurat only combines a single pair of datasets at a time.
+The Seurat package originally proposed another correction method for combining
+multiple datasets, called
+[CCA](https://www.biorxiv.org/content/early/2017/07/18/164889). However, unlike
+mnnCorrect it doesn't correct the expression matrix itself directly. Instead
+Seurat finds a lower dimensional subspace for each dataset then corrects these
+subspaces. Also different from mnnCorrect, Seurat could only combine a single
+pair of datasets at a time with CCA.
 
-Seurat uses gene-gene correlations to identify the biological structure in the dataset with a method called canonical correlation analysis (CCA). Seurat learns the shared structure to the gene-gene correlations and then evaluates how well each cell fits this structure. Cells which must better described by a data-specific dimensionality reduction method than by the shared correlation structure are assumed to represent dataset-specific cell-types/states and are discarded before aligning the two datasets. Finally the two datasets are aligned using 'warping' algorithms which normalize the low-dimensional representations of each dataset in a way that is robust to differences in population density.
+Recently, the Seurat team proposed a new method to integrate single-cell
+datasets using "anchors" [@Stuart2019-zx].
+
+```{r, echo=FALSE, merged_seurat.width = '80%', fig.cap="Seurat integration with anchors."}
+knitr::include_graphics("figures/stuart_graphical_abstract.jpg")
+```
 
-Note because Seurat uses up a lot of library space you will have to restart your R-session to load it, and the plots/output won't be automatically generated on this page.
+_Note:_ because Seurat uses up a lot of library space you may need to restart
+your R-session to load it, and the plots/output won't be automatically generated
+on this page. Also the analysis presented below is relatively time and memory
+consuming. As such, we have not run these commands when compiling this website
+and therefore no output is shown here. Approach with caution if you are trying
+to run this analysis on a laptop!
 
 Reload the data:
+
 ```{r, eval=FALSE}
 muraro <- readRDS("data/pancreas/muraro.rds")
 segerstolpe <- readRDS("data/pancreas/segerstolpe.rds")
@@ -389,118 +478,243 @@ rownames(muraro) <- rowData(muraro)$feature_symbol
 identical(rownames(segerstolpe), rownames(muraro))
 ```
 
-
 First we will reformat our data into Seurat objects:
 
 ```{r, eval=FALSE}
-require("Seurat")
+library("Seurat")
+library(scater)
+library(SingleCellExperiment)
 set.seed(4719364)
-muraro_seurat <- CreateSeuratObject(raw.data=assays(muraro)[["normcounts"]]) # raw counts aren't available for muraro
+muraro_seurat <- CreateSeuratObject(normcounts(muraro)) # raw counts aren't available for muraro
 muraro_seurat@meta.data[, "dataset"] <- 1
 muraro_seurat@meta.data[, "celltype"] <- paste("m",colData(muraro)$cell_type1, sep="-")
 
-seger_seurat <- CreateSeuratObject(raw.data=assays(segerstolpe)[["counts"]])
+seger_seurat <- CreateSeuratObject(counts(segerstolpe))
 seger_seurat@meta.data[, "dataset"] <- 2
 seger_seurat@meta.data[, "celltype"] <- paste("s",colData(segerstolpe)$cell_type1, sep="-")
 ```
-Next we must normalize, scale and identify highly variable genes for each dataset:
+
+Next we must normalize, scale and identify highly variable genes for each
+dataset:
+
 ```{r, eval=FALSE}
 muraro_seurat <- NormalizeData(object=muraro_seurat)
 muraro_seurat <- ScaleData(object=muraro_seurat)
-muraro_seurat <- FindVariableGenes(object=muraro_seurat, do.plot=TRUE)
+muraro_seurat <- FindVariableFeatures(object=muraro_seurat)
 
 seger_seurat <- NormalizeData(object=seger_seurat)
 seger_seurat <- ScaleData(object=seger_seurat)
-seger_seurat <- FindVariableGenes(object=seger_seurat, do.plot=TRUE)
+seger_seurat <- FindVariableFeatures(object=seger_seurat)
 ```
 
-```{r, echo=FALSE, merged_seurat.width = '80%', fig.cap="muraro variable genes"}
-knitr::include_graphics("figures/muraro_seurat_hvg.png")
+Even though Seurat corrects for the relationship between dispersion and mean
+expression, it doesn't use the corrected value when ranking features. Compare
+the results of the command below with the results in the plots above:
+
+```{r, eval=FALSE}
+head(VariableFeatures(muraro_seurat), 50)
+head(VariableFeatures(seger_seurat), 50)
 ```
-```{r, echo=FALSE, merged_seurat.width = '80%', fig.cap="segerstolpe variable genes"}
-knitr::include_graphics("figures/seger_seurat_hvg.png")
+
+Next, we use the `SelectIntegrationFeatures` function to select with genes to
+use to integrate the two datasets here.
+
+```{r, eval=FALSE}
+seurat_list <- list(muraro = muraro_seurat, seger = seger_seurat)
+features <- SelectIntegrationFeatures(object.list = seurat_list)
+seurat_list <- lapply(X = seurat_list, FUN = function(x) {
+  x <- ScaleData(x, features = features, verbose = FALSE)
+  x <- RunPCA(x, features = features, verbose = FALSE)
+})
 ```
-Eventhough Seurat corrects for the relationship between dispersion and mean expression, it doesn't use the corrected value when ranking features. Compare the results of the command below with the results in the plots above:
+
+Now we will run `FindIntegrationAnchors` and then `IntegrateData` to integrate
+the datasets:
 
 ```{r, eval=FALSE}
-head(muraro_seurat@hvg.info, 50)
-head(seger_seurat@hvg.info, 50)
+pancreas.anchors <- FindIntegrationAnchors(object.list = seurat_list, dims = 1:30)
+
+pancreas.integrated <- IntegrateData(anchorset = pancreas.anchors, dims = 1:30)
 ```
 
-But we will follow their example and use the top 2000 most dispersed genes withmerged_seurat correcting for mean expression from each dataset anyway.
+To view the output of the integration we could run the following plotting
+commands.
 
 ```{r, eval=FALSE}
-gene.use <- union(rownames(x = head(x = muraro_seurat@hvg.info, n = 2000)),
-                  rownames(x = head(x = seger_seurat@hvg.info, n = 2000)))
+library(ggplot2)
+library(cowplot)
+                                        # switch to integrated assay. The variable features of this assay are automatically
+                                        # set during IntegrateData
+DefaultAssay(pancreas.integrated) <- "integrated"
+
+                                        # Run the standard workflow for visualization and clustering
+pancreas.integrated <- ScaleData(pancreas.integrated, verbose = FALSE)
+pancreas.integrated <- RunPCA(pancreas.integrated, npcs = 30, verbose = FALSE)
+pancreas.integrated <- RunUMAP(pancreas.integrated, reduction = "pca", dims = 1:30)
+p1 <- DimPlot(pancreas.integrated, reduction = "umap", group.by = "dataset")
+p2 <- DimPlot(pancreas.integrated, reduction = "umap", group.by = "celltype", label = TRUE, 
+              repel = TRUE) + NoLegend()
+plot_grid(p1, p2)
 
 ```
 
-__Exercise__
-Find the features we would use if we selected the top 2000 most dispersed after scaling by mean. (Hint: consider the `order` function)
+__Q:__ How well do you think this approach integrates these two pancreatic datasets?
+
+__Advanced Exercise__
+Use the clustering methods we previously covered on the combined datasets. Do you identify any novel cell-types?
+
+__For more details on using Seurat for this type of analysis, please consult the
+[relevant vignettes](https://satijalab.org/seurat/v3.1/integration.html) and
+documentation__
 
-__Answer__
-```{r, include=FALSE, eval=FALSE}
-seger_hvg <- seger_seurat@hvg.info
-seger_hvg <- seger_hvg[order(seger_hvg[,3], decreasing=TRUE),]
-muraro_hvg <- seger_seurat@hvg.info
-muraro_hvg <- muraro_hvg[order(muraro_hvg[,3], decreasing=TRUE),]
 
-gene.use.scaled <- union(rownames(seger_hvg[1:2000,]), rownames(muraro_hvg[1:2000,]))
+## Search scRNA-Seq data
+
+```{r, echo=TRUE, message=FALSE, warning=FALSE}
+library(scfind)
+library(SingleCellExperiment)
+library(plotly)
+set.seed(1234567)
 ```
 
-Now we will run CCA to find the shared correlation structure for these two datasets:
+### About 
 
-Note to speed up the calculations we will be using only the top 5 dimensions but ideally you would consider many more and then select the top most informative ones using `DimHeatmap`.
+`scfind` is a tool that allows one to search single cell RNA-Seq collections
+(Atlas) using lists of genes, e.g. searching for cells and cell-types where a
+specific set of genes are expressed. `scfind` is a [Github
+package](https://github.com/hemberg-lab/scfind). Cloud implementation of
+`scfind` with a large collection of datasets is available on our
+[website](https://scfind.sanger.ac.uk/).
 
-```{r, eval=FALSE}
-merged_seurat <- RunCCA(object=muraro_seurat, object2=seger_seurat, genes.use=gene.use, add.cell.id1="m", add.cell.id2="s", num.cc = 5)
-DimPlot(object = merged_seurat, reduction.use = "cca", group.by = "dataset", pt.size = 0.5) # Before correcting
+```{r, echo = FALSE, out.width = '80%', fig.cap="scfind can be used to search large collection of scRNA-seq data by a list of gene IDs."}
+knitr::include_graphics("figures/scfind.png")
 ```
-```{r, echo=FALSE, merged_seurat.width = '80%', fig.cap="Before Aligning"}
-knitr::include_graphics("figures/cca_before.png")
+
+### Dataset
+
+We will run `scfind` on the Tabula Muris 10X dataset. `scfind` also operates on
+`SingleCellExperiment` class:
+
+```{r, message=FALSE, warning=FALSE}
+tm10x_heart <- readRDS("data/sce/Heart_10X.rds")
+tm10x_heart <- tm10x_heart[, !is.na(tm10x_heart$cell_type1)]
+tm10x_heart
+colData(tm10x_heart)
 ```
-To identify dataset specific cell-types we compare how well cells are 'explained' by CCA vs dataset-specific principal component analysis.
 
-```{r, eval=FALSE}
-merged_seurat <- CalcVarExpRatio(object = merged_seurat, reduction.type = "pca", grouping.var = "dataset", dims.use = 1:5)
-merged.all <- merged_seurat
-merged_seurat <- SubsetData(object=merged_seurat, subset.name="var.ratio.pca", accept.low = 0.5) # CCA > 1/2 as good as PCA
-merged.discard <- SubsetData(object=merged.all, subset.name="var.ratio.pca", accept.high = 0.5)
+### Cell type search
+
+If one has a list of genes that you would like to check against you dataset,
+i.e. find the cell types that most likely represent your genes (highest
+expression), then scfind allows one to do that by first creating a gene index
+and then very quickly searching the index.
+
+The input data for indexing is the raw count matrix of the
+`SingleCellExperiment` class. By default the `cell_type1` column of the
+`colData` slot of the `SingleCellExperiment` object is used to define cell
+types, however it can also be defined manually using the `cell.type.label`
+argument of the `buildCellTypeIndex`.
+
 
-summary(factor(merged.discard@meta.data$celltype)) # check the cell-type of the discarded cells.
+```{r}
+heart_index <- buildCellTypeIndex(
+  tm10x_heart,
+  cell_type_column = "cell_type1"
+)
 ```
-Here we can see that despite both datasets containing endothelial cells, almost all of them have been discarded as "dataset-specific". Now we can align the datasets:
 
-```{r, eval=FALSE}
-merged_seurat <- AlignSubspace(object = merged_seurat, reduction.type = "cca", grouping.var = "dataset", dims.align = 1:5)
-DimPlot(object = merged_seurat, reduction.use = "cca.aligned", group.by = "dataset", pt.size = 0.5) # After aligning subspaces
+`scfind` adopts a two-step compression strategy which allows efficient
+compression of large cell-by-gene matrix and allows fast retrieval of data by
+gene query. The authors estimated that one can achieve 2 orders of magnitude
+compression with this method.
+
+
+We can look at the p-values from `scfind` to discover the best matches.
+
+```{r}
+p_values <- -log10(findCellType(heart_index, c("Fstl1")))
+barplot(p_values, ylab = "-log10(pval)", las = 2)
 ```
 
-```{r, echo=FALSE, merged_seurat.width = '80%', fig.cap="After Aligning"}
-knitr::include_graphics("figures/cca_after.png")
+__Exercise:__ Search for some of your favourite genes in the heart data!
+
+__Exercise:__ Repeat this approach to build a cell type index for the thymus
+data.
+
+```{r}
+tm10x_thymus <- readRDS("data/sce/Thymus_10X.rds")
+tm10x_thymus <- tm10x_thymus[, !is.na(tm10x_thymus$cell_type1)]
+thymus_index <- buildCellTypeIndex(
+  tm10x_thymus, 
+  cell_type_column   = "cell_type1"
+)
+
+p_values <- -log10(findCellType(thymus_index, c("Cd74")))
+barplot(p_values, ylab = "-log10(pval)", las = 2)
 ```
 
-__Exercise__
-Compare the results for if you use the features after scaling dispersions.
+__Exercise:__ Search for interesting genes in the thymus data!
 
-__Answer__
-```{r, eval=FALSE, include=FALSE}
-merged_seurat <- RunCCA(object=muraro_seurat, object2=seger_seurat, genes.use=gene.use.scaled, add.cell.id1="m", add.cell.id2="s", num.cc = 5)
-merged_seurat <- AlignSubspace(object = merged_seurat, reduction.type = "cca", grouping.var = "dataset", dims.align = 1:5)
-DimPlot(object = merged_seurat, reduction.use = "cca.aligned", group.by = "dataset", pt.size = 0.5) # After aligning subspaces
+
+### Cell search 
+
+If one is more interested in finding out in which cells all the genes from your
+gene list are expressed in, then you can build a cell index instead of a cell
+type index. buildCellIndex function should be used for building the index and
+findCell for searching the index:
+
+```{r}
+heart_gene_index <- buildCellIndex(tm10x_heart)
+res <- findCell(heart_gene_index, c("Fstl1", "Ppia"))
+head(res$common_exprs_cells)
+barplot(-log10(res$p_values), ylab = "-log10(pval)", las = 2)
 ```
-```{r, echo=FALSE, merged_seurat.width = '80%', fig.cap="After Aligning"}
-knitr::include_graphics("figures/cca_after2.png")
+
+Cell search reports the p-values corresponding to cell types as well.
+
+
+__Exercise:__ Search for some of your favourite genes in the heart data!
+
+__Exercise:__ Repeat this approach to build a cell type index for the thymus
+data.
+
+
+__Advanced exercise:__ Try `scfind` on the Deng data and see if you can find
+interesting results for well-known developmental genes in that mouse embryo
+data.
+
+
+### Explore
+
+__Exercise:__ Play around with the [scfind](https://scfind.sanger.ac.uk)
+website, which allow for searching of several large "atlas"-type datasets. This
+could be a useful way to check the utility or validity of cell-type marker genes
+before use in your own analysis.
+
+For example, you might explore the following cardiac contractility genes in the
+Mouse Atlas or Tabula Muris datasets.
+
+```{r}
+cardiac_contractility <- c("Ace2","Fkbp1b","Gh","Cacna1c","Cd59b","Ppp1r1a","Tnnt2","Nos1","Agtr1a","Camk2g","Grk2","Ins2","Dnah8","Igf1","Nos3","Nppa","Nppb","Il6","Myh6","Ren2","Tnni3","Apln","Kcnmb1","Pik3cg","Prkca","Aplnr","Slc8a1","Ace","Akt1","Edn1","Kcnmb2","Nos2","Tnf","Myh14","Adrb2","Agt","Adrb1","Atp2a2","Ryr2","Pln")
 ```
 
+#### In-silico gating
 
-__Advanced Exercise__
-Use the clustering methods we previously covered on the combined datasets. Do you identify any novel cell-types?
+On the scfind website, you can perform "in-silico gating" using logical
+operators on marker genes to identify cell type subsets similar to the cell
+sorting works. To do so, you can add logical operators including "-" and "*" for
+"no" and "intermediate" expression, respectively in front of the gene name.
 
 
+__Exercise:__ Try out some in silico gating on one or more datasets available at
+the scfind website. For example, can you gate to find regulatory T cells and
+effector memory T cells?
+
 
 ### sessionInfo()
 
-```{r echo=FALSE}
+```{r}
 sessionInfo()
 ```
+
+
diff --git a/course_files/search.Rmd b/course_files/search.Rmd
deleted file mode 100644
index a57f15a..0000000
--- a/course_files/search.Rmd
+++ /dev/null
@@ -1,243 +0,0 @@
----
-output: html_document
----
-
-```{r setup, echo=FALSE}
-knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=FALSE)
-knitr::opts_knit$set(root.dir = normalizePath(".."))
-```
-
-## Search scRNA-Seq data
-
-```{r, echo=TRUE, message=FALSE, warning=FALSE}
-library(scfind)
-library(SingleCellExperiment)
-library(plotly)
-set.seed(1234567)
-```
-
-### About 
-
-`scfind` is a tool that allows one to search single cell RNA-Seq collections
-(Atlas) using lists of genes, e.g. searching for cells and cell-types where a
-specific set of genes are expressed. `scfind` is a [Github
-package](https://github.com/hemberg-lab/scfind). Cloud implementation of
-`scfind` with a large collection of datasets is available on our
-[website](https://scfind.sanger.ac.uk/).
-
-```{r, echo = FALSE, out.width = '80%', fig.cap="scfind can be used to search large collection of scRNA-seq data by a list of gene IDs."}
-knitr::include_graphics("figures/scfind.png")
-```
-
-### Dataset
-
-We will run `scfind` on the Tabula Muris 10X dataset. `scfind` also operates on
-`SingleCellExperiment` class:
-
-```{r, message=FALSE, warning=FALSE}
-tm10x_heart <- readRDS("data/sce/Heart_10X.rds")
-tm10x_heart
-colData(tm10x_heart)
-```
-
-### Gene Index
-
-Now we need to create a gene index using our dataset:
-```{r}
-heart_index <- buildCellTypeIndex(
-  tm10x_heart,
-  cell_type_column = "cell_type1"
-)
-```
-
-`scfind` adopts a two-step compression strategy which allows efficient
-compression of large cell-by-gene matrix and allows fast retrieval of data by
-gene query. We estimated that one can achieve 2 orders of magnitude compression
-with this method.
-
-The input matrix for indexing is the raw count matrix of the
-`SingleCellExperiment` class. By default the `cell_type1` column of the
-`colData` slot of the `SingleCellExperiment` object is used to define cell
-types, however it can also be defined manually using the `cell.type.label`
-argument of the `buildCellTypeIndex`. For dataset with more than one tissue, you
-can also merge all tissues together to create a super index using the function
-`mergeDataset`.
-
-The index can be saved in .rds format using `saveObject` function and loaded
-using `loadObject` function for future use.
-
-```{r}
-tm10x_thymus <- readRDS("data/sce/Thymus_10X.rds")
-thymus_index <- buildCellTypeIndex(
-  tm10x_thymus, 
-  cell_type_column   = "cell_type1"
-)
-## scfind_index <- mergeDataset(heart_index, thymus_index)
-## scfind_index@datasets
-## cellTypeNames(scfind_index)
-## sample(scfindGenes(scfind_index),20)
-```
-
-
-To quickly and easily find the enriched cell type using an interactive Shiny
-application use the following method:
-
-```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
-scfindShiny(scfind_index)
-```
-
-
-### Marker genes
-
-Now let's find the marker genes for Thymus T cell in the datasets
-```{r, eval=FALSE}
-# Showing the top 5 marker genes for each cell type and sort by F1 score.
-t_cell_markers <- cellTypeMarkers(scfind_index, cell.types = "Thymus.T cell", top.k = 5, sort.field = "f1") 
-
-t_cell_markers
-```
-
-Next, you can evaluate the markers of Thymus T cell in Thymus stromal cell
-```{r, eval=FALSE}
-evaluateMarkers(
-  scfind_index, 
-  gene.list = as.character(t_cell_markers$genes), 
-  cell.types = "Thymus.stromal cell", 
-  sort.field = "f1"
-)
-```
-
-```{r, eval=FALSE}
-# By default, the marker evaluation takes all cell types in the dataset as background cell type, but you can use the argument `background.cell.types` to fine tune the evaluation
-
-background <- cellTypeNames(scfind_index, datasets = "Thymus")
-background
-
-evaluateMarkers(
-  scfind_index, 
-  gene.list = as.character(t_cell_markers$genes), 
-  cell.types = "Thymus.stromal cell", 
-  sort.field = "f1", 
-  background.cell.types = background
-)
-```
-
-
-### Search cells by a gene list
-
-`scfind` can instantly identify the cell type that best represents the genes of
-interest from large single cell dataset. We will use the marker genes identified
-in an original publication [Yanbin et al.
-2015](https://www.nature.com/articles/cr201599). Cardiomyocyte-specific markers
-used in immunostaining as shown in Figure 1.
-
-```{r, eval=FALSE}
-cardiomyocytes <- c("Mef2c", "Gata4", "Nkx2.5", "Myh6", "tnnt2", "tnni3", "CDH2", "Cx43", "GJA1") 
-result <- markerGenes(
-  scfind_index, 
-  gene.list = cardiomyocytes
-)
-result
-```
-
-
-To allow search of enriched cell type from a long list of gene query, `scfind`
-features a query optimization routine. First, the function `markerGenes` will
-counter suggest subqueries that with the highest support in the dataset. The
-TF-IDF score for each gene set allows user to identify the best subquery for
-finding the most relevant cell type.
-
-```{r, eval=FALSE}
-best_subquery <- result[which.max(result$tfidf),] # get the best subquery by ranking TF-IDF score
-best_subquery <- strsplit(as.character(best_subquery$Query), ",")[[1]] # obtain gene list
-hyperQueryCellTypes(
-  scfind_index,
-  gene.list = best_subquery
-)
-```
-
-`hyperQueryCellTypes` function returns a list of p-values corresponding to all
-cell types in a given dataset. It also outputs a list of cells in which genes
-from the given gene list are co-expressed.
-
-
-__Exercise 1__
-
-Find the marker genes of all cell types in the Heart dataset
-
-```{r, echo=FALSE, eval=FALSE}
-heart_cell_types <- cellTypeNames(scfind_index, "Heart")
-
-for(cell_type_name in heart_cell_types)
-{
-  result <- cellTypeMarkers(
-    scfind_index, 
-    cell.types = cell_type_name, 
-    sort.field = "f1", 
-    background = heart_cell_types) 
-  print(result)
-}
-```
-
-```{r}
-cardiac_contractility <- c("Ace2","Fkbp1b","Gh","Cacna1c","Cd59b","Ppp1r1a","Tnnt2","Nos1","Agtr1a","Camk2g","Grk2","Ins2","Dnah8","Igf1","Nos3","Nppa","Nppb","Il6","Myh6","Ren2","Tnni3","Apln","Kcnmb1","Pik3cg","Prkca","Aplnr","Slc8a1","Ace","Akt1","Edn1","Kcnmb2","Nos2","Tnf","Myh14","Adrb2","Agt","Adrb1","Atp2a2","Ryr2","Pln")
-```
-
-__Exercise 2__
-
-Input the gene list relevant to "cardiac contractility" and find the best gene
-set with the highest support. Identify the enriched cell type for this query.
-
-```{r, echo=FALSE, eval=FALSE}
-result <- markerGenes(scfind_index, cardiac_contractility)
-best_sub_query <- as.character(result[which.max(result$tfidf), c("Query")])
-best_sub_query <- strsplit(best_sub_query, ",")[[1]]
-hyperQueryCellTypes(scfind_index, best_sub_query)
-```
-
-### In-silico gating
-
-Using the `findCellTypes` function, you can perform in-silico gating to identify
-cell type subsets as if the way cell sorting works. To do so, you can add
-logical operators including "-" and "*" for "no" and "intermediate" expression,
-respectively in front of the gene name. Here, we use operators to subset T cell
-of the Thymus dataset into effector T regulatory cells and effector memory T
-cell.
-
-```{r, eval=FALSE}
-effector_t_reg_cells <- c("*Ptprc", "-Il7r", "Ctla4", "-Il7r")
-effector_memory_t_cells <- c("-Il2ra", "*Ptprc", "Il7r")
-
-subset_treg <- findCellTypes(scfind_index, effector_t_reg_cells, "Thymus") 
-subset_tmem <- findCellTypes(scfind_index, effector_memory_t_cells, "Thymus")
-
-subset_treg
-subset_tmem
-```
-
-Let's use the TSNE plot information from the `SingleCellExperiment` of Thymus to
-illustrate the gating result
-
-```{r, eval=FALSE}
-map <- data.frame(
-  tm10x_thymus@reducedDims[['TSNE']], 
-  cell_type = as.character(colData(tm10x_thymus)$cell_type1), 
-  stringsAsFactors = F
-  )
-
-map <- subset(map, cell_type == "T cell") 
-
-plot_ly(map, x = ~X1 , y = ~X2, type="scatter")
-
-map$cell_type[subset_treg$`Thymus.T cell`] <- "Effector T Regulatory Cell"
-map$cell_type[subset_tmem$`Thymus.T cell`] <- "Effector Memory T Cell"
-
-plot_ly(map, x = ~X1 , y = ~X2, type="scatter", color = ~cell_type)
-```
-
-
-### sessionInfo()
-
-```{r}
-sessionInfo()
-```
-- 
GitLab