From 31532ab6e966b4ba662da495d9dd89ef524e1662 Mon Sep 17 00:00:00 2001
From: Davis McCarthy <davismcc@gmail.com>
Date: Tue, 1 Oct 2019 18:54:00 +1000
Subject: [PATCH] Updating feature selection chapter

---
 course_files/feature-selection.Rmd | 317 +++++++++++++++++++----------
 1 file changed, 212 insertions(+), 105 deletions(-)

diff --git a/course_files/feature-selection.Rmd b/course_files/feature-selection.Rmd
index 965e18f..58a4cc4 100644
--- a/course_files/feature-selection.Rmd
+++ b/course_files/feature-selection.Rmd
@@ -4,7 +4,7 @@ output: html_document
 ---
 
 ```{r setup, echo=FALSE}
-knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE)
+knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE, warning=FALSE, message=FALSE)
 knitr::opts_knit$set(root.dir = normalizePath(".."))
 ```
 
@@ -16,6 +16,9 @@ library(matrixStats)
 library(M3Drop)
 library(RColorBrewer)
 library(SingleCellExperiment)
+library(Polychrome)
+library(scater)
+library(scran)
 set.seed(1)
 ```
 
@@ -47,7 +50,8 @@ For this section we will continue working with the Deng data.
 ```{r}
 deng <- readRDS("data/deng/deng-reads.rds")
 celltype_labs <- colData(deng)$cell_type2
-cell_colors <- brewer.pal(max(3,length(unique(celltype_labs))), "Set3")
+cell_colors <- createPalette(10, c("#010101", "#ff0000"), M=1000)
+names(cell_colors) <- unique(as.character(celltype_labs))
 ```
 
 Feature selection is performed after QC, however this data has already been QCed so
@@ -87,12 +91,13 @@ first is to identify genes which behave differently from a null model
 describing just the technical noise expected in the dataset.
 
 If the dataset contains spike-in RNAs they can be used to directly model
-technical noise. However, measurements of spike-ins may not experience
-the same technical noise as endogenous transcripts [(Svensson et al., 2017)](https://www.nature.com/nmeth/journal/v14/n4/full/nmeth.4220.html).
-In addition, scRNASeq experiments often contain only a small number of
-spike-ins which reduces our confidence in fitted model parameters.
+technical noise. However, measurements of spike-ins may not experience the same
+technical noise as endogenous transcripts [(Svensson et al.,
+2017)](https://www.nature.com/nmeth/journal/v14/n4/full/nmeth.4220.html). In
+addition, scRNASeq experiments often contain only a small number of spike-ins
+which reduces our confidence in fitted model parameters.
 
-#### Highly Variable Genes
+#### Highly Variable Genes - Brennecke method
 
 The first method proposed to identify features in scRNASeq datasets
 was to identify highly variable genes (HVG). HVG assumes that if genes
@@ -117,20 +122,22 @@ plot(
     main=""
 )
 ```
-A popular method to correct for the relationship between variance and mean expression
-was proposed by [Brennecke et al.](http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2645.html).
-To use the Brennecke method, we first normalize for library size then calculate
-the mean and the square coefficient of variation (variation divided by
-the squared mean expression). A quadratic curve is fit to the relationship
-between these two variables for the ERCC spike-in, and then a chi-square test is used to find genes
-significantly above the curve. This method is included in the M3Drop package as the
-Brennecke_getVariableGenes(counts, spikes) function. However, this dataset does not contain spike-ins
-so we will use the entire dataset to estimate the technical noise.
-
-In the figure below the red curve
-is the fitted technical noise model and the dashed line is the 95%
-CI. Pink dots are the genes with significant biological variability
-after multiple-testing correction.
+
+An early method to correct for the relationship between variance and mean
+expression was proposed by [Brennecke et
+al.](http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2645.html). To use
+the Brennecke method, we first normalize for library size then calculate the
+mean and the square coefficient of variation (variation divided by the squared
+mean expression). A quadratic curve is fit to the relationship between these two
+variables for the ERCC spike-in, and then a chi-square test is used to find
+genes significantly above the curve. This method is included in the M3Drop
+package as the Brennecke_getVariableGenes(counts, spikes) function. However,
+this dataset does not contain spike-ins so we will use the entire dataset to
+estimate the technical noise.
+
+In the figure below the red curve is the fitted technical noise model and the
+dashed line is the 95% CI. Pink dots are the genes with significant biological
+variability after multiple-testing correction.
 
 ```{r, fig.width = 7, fig.height = 6}
 Brennecke_HVG <- BrenneckeGetVariableGenes(
@@ -140,9 +147,10 @@ Brennecke_HVG <- BrenneckeGetVariableGenes(
 )
 ```
 
-This function returns a matrix of significant genes as well as their estimated effect size (difference 
-between observed and expected coefficient of variation), and their significance as raw p.values and 
-FDR corrected q.values. For now we will just keep the names of the significant HVG genes.
+This function returns a matrix of significant genes as well as their estimated
+effect size (difference between observed and expected coefficient of variation),
+and their significance as raw p.values and FDR corrected q.values. For now we
+will just keep the names of the significant HVG genes.
 
 ```{r}
 HVG_genes <- Brennecke_HVG$Gene
@@ -154,23 +162,73 @@ How many genes were signifcant using BrenneckeGetVariableGenes?
 ```{r, echo=FALSE, fig.width = 8.5, fig.height = 6}
 length(HVG_genes)
 ```
+
+#### Highly Variable Genes - simpleSingleCell method
+
+The Bioconductor
+[simpleSingleCell](https://bioconductor.org/packages/release/workflows/html/simpleSingleCell.html)
+workflow has a great deal of excellent material to help your analyses. Here, we
+show how to identify highly variable genes using functionality from the `scran`
+package.
+
+This method assumes that technical variance is captured by a Poisson
+distribution, and that variance beyond that explained by a Poisson distribution
+represents biological variance of interest. This approach separates the
+biological component of the variance from the technical component and thus can
+rank genes based on their "biological" variance. This model also provides
+p-values (with FDR adjustment) that can be used to identify the set of
+"significant" highly variable genes at a given significance level.
+
+```{r hvg-simpleSingleCell}
+### mamke a technical trend of variance based on Poisson
+var.fit <- trendVar(deng, parametric=TRUE, loess.args=list(span=0.4), use.spikes = FALSE)
+var.out <- decomposeVar(deng, var.fit)
+plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression", 
+     ylab="Variance of log-expression")
+points(var.out$mean[isSpike(deng)], var.out$total[isSpike(deng)], col="red", pch=16)
+curve(var.fit$trend(x), col="dodgerblue", add=TRUE, lwd=2)
+
+chosen.genes <- order(var.out$bio, decreasing=TRUE)[1:10]
+plotExpression(deng, rownames(var.out)[chosen.genes], 
+               point_alpha=0.5, jitter_type="jitter")
+
+top.dec <- var.out[order(var.out$bio, decreasing=TRUE),]
+                                        # the highly variable genes with largest biological components
+head(top.dec)
+simplesinglecell_genes <- rownames(top.dec)[top.dec$FDR < 0.001]
+table(top.dec$FDR < 0.001)
+```
+
+If we set an FDR threshold of 0.1%, this approach identifies around 1300 highly
+variable genes.
+
+The output of this variance modelling can be used as input to a `denoisePCA()`
+function to compute "denoised" principal components for clustering and other
+downstream analyses (details not shown here; please see the `simpleSingleCell`
+workflow).
+
+
 #### High Dropout Genes
 
-An alternative to finding HVGs is to identify genes with unexpectedly high numbers of zeros.
-The frequency of zeros, known as the "dropout rate", is very closely related to expression level
-in scRNASeq data. Zeros are the dominant feature of single-cell RNASeq data, typically accounting
-for over half of the entries in the final expression matrix. These zeros predominantly result
-from the failure of mRNAs failing to be reversed transcribed [(Andrews and Hemberg, 2016)](http://www.biorxiv.org/content/early/2017/05/25/065094). Reverse transcription
-is an enzyme reaction thus can be modelled using the Michaelis-Menten equation:
+An alternative to finding HVGs is to identify genes with unexpectedly high
+numbers of zeros. The frequency of zeros, known as the "dropout rate", is very
+closely related to expression level in scRNASeq data. Zeros are the dominant
+feature of single-cell RNASeq data, typically accounting for over half of the
+entries in the final expression matrix. These zeros predominantly result from
+the failure of mRNAs failing to be reversed transcribed [(Andrews and Hemberg,
+2016)](http://www.biorxiv.org/content/early/2017/05/25/065094). Reverse
+transcription is an enzyme reaction thus can be modelled using the
+Michaelis-Menten equation:
 
 $$P_{dropout} = 1 - S/(K + S)$$
 
-where $S$ is the mRNA concentration in the cell (we will estimate this as average expression)
-and $K$ is the Michaelis-Menten constant.
+where $S$ is the mRNA concentration in the cell (we will estimate this as
+average expression) and $K$ is the Michaelis-Menten constant.
 
-Because the Michaelis-Menten equation is a convex non-linear function, genes which are
-differentially expression across two or more populations of cells in our dataset will
-be shifted up/right of the Michaelis-Menten model (see Figure below).
+Because the Michaelis-Menten equation is a convex non-linear function, genes
+which are differentially expression across two or more populations of cells in
+our dataset will be shifted up/right of the Michaelis-Menten model (see Figure
+below).
 
 ```{r, fig.width = 8.5, fig.height = 6, echo=TRUE}
 K <- 49
@@ -205,9 +263,12 @@ points(
     cex = 3
 )
 ```
-__Note__: add `log="x"` to the `plot` call above to see how this looks on the log scale, which is used in M3Drop figures.
 
-__Exercise 4__: Produce the same plot as above with different expression levels (S1 & S2) and/or mixtures (mix).
+__Note__: add `log="x"` to the `plot` call above to see how this looks on the
+log scale, which is used in M3Drop figures.
+
+__Exercise 4__: Produce the same plot as above with different expression levels
+(S1 & S2) and/or mixtures (mix).
 
 ```{r, include=FALSE}
 plot(
@@ -287,20 +348,95 @@ How many genes were signifcant using NBumiFeatureSelectionCombinedDrop?
 nrow(DropFS)
 ```
 
+#### Residual variance from a (regularized) negative binomial model
+
+In the [normalization chapter](#normalization-theory) we introduced the
+`sctransform` approach to using Pearson residuals from an regularized negative
+binomial generalized linear model to normalize scRNA-seq data. 
+
+The residual variance of genes (i.e. the variance of the Pearson residuals)
+provides a way to identify highly variable genes, where the "variance" is
+decoupled from the average level of expression of the gene.
+
+The residual variance is easily accessible from the `sctransform` output as we
+show below.
+
+First, we run `sctransform` as we did previously. 
+
+```{r sctransform-apply, warning=FALSE, message=FALSE}
+deng_sparse <- as(counts(deng), "dgCMatrix")
+### Genes expressed in at least 5 cells will be kept
+sctnorm_data <- sctransform::vst(umi = deng_sparse, min_cells = 1,
+                                 cell_attr = as.data.frame(colData(deng)),
+                                 latent_var = "log10_total_counts_endogenous")
+sctnorm_data$model_str
+```
+
+
+```{r sctransform-feature-select, warning=FALSE, message=FALSE}
+library(ggplot2)
+ggplot(sctnorm_data$gene_attr, aes(residual_variance)) +
+  geom_histogram(binwidth=0.1) +
+  geom_vline(xintercept=1, color='red') + xlim(0, 10)
+
+sctnorm_data$gene_attr$label <- rownames(sctnorm_data$gene_attr)
+ggplot(sctnorm_data$gene_attr, aes(x = gmean, y=residual_variance)) +
+  geom_point(alpha = 0.6) +
+  geom_point(colour = "firebrick2",
+             data = sctnorm_data$gene_attr[sctnorm_data$gene_attr$residual_variance > 3,]) +
+  scale_x_log10() +
+  geom_hline(yintercept   = 1, size = 3, color = "dodgerblue") +
+  geom_label(aes(label = label),
+             data = sctnorm_data$gene_attr[sctnorm_data$gene_attr$residual_variance > 30,]) +
+  theme_bw()
+
+sct_genes <- rownames(sctnorm_data$gene_attr)[sctnorm_data$gene_attr$residual_variance > 4]
+table(sctnorm_data$gene_attr$residual_variance > 4)
+```
+
+If we set a (relatively arbitrary) threshold of a residual variance greater than
+three marking a "highly variable gene", then we identify around 2000 highly
+variable genes with this `sctransform` approach.
+
+
+[NB: the `deng` data is extremely high depth for scRNA-seq data, so not the most
+applicable dataset for `sctransform`, but we include this analysis here to
+demonstrate the method rather than make any evaluation of its performance in
+general.]
+
+
+Although not explored here, the _deviance_ statistic from the regularized NB GLM
+fit provides a natural way to select informative features for downstream
+analyses.
+
+The [deviance](https://en.wikipedia.org/wiki/Deviance_(statistics)) is a
+goodness-of-fit statistic for a statistical model. As Wikipedia notes, deviance
+is a generalization of the idea of using the sum of squares of residuals in
+ordinary least squares to cases where model-fitting is achieved by maximum
+likelihood. It plays an important role in exponential dispersion models and
+generalized linear models, such as the negative binomial model.
+
+However, `sctransform` does not seem set up to use the model deviance to select
+informative features, but we expect this could be a direction the field goes in
+the near future. Keep an eye out!
+
+
 ### Correlated Expression
 
-A completely different approach to feature selection is to use gene-gene correlations. This method
-is based on the idea that multiple genes will be differentially expressed between different cell-types
-or cell-states. Genes which are expressed in the same cell-population will be positively correlated
-with each other where as genes expressed in different cell-populations will be negatively correated with
-each other. Thus important genes can be identified by the magnitude of their correlation
-with other genes.
+A completely different approach to feature selection is to use gene-gene
+correlations. This method is based on the idea that multiple genes will be
+differentially expressed between different cell-types or cell-states. Genes
+which are expressed in the same cell-population will be positively correlated
+with each other where as genes expressed in different cell-populations will be
+negatively correated with each other. Thus important genes can be identified by
+the magnitude of their correlation with other genes.
 
-The limitation of this method is that it assumes technical noise is random and independent for each cell,
-thus shouldn't produce gene-gene correlations, but this assumption is violated by batch effects which are
-generally systematic between different experimental batches and will produce gene-gene correlations. As a
-result it is more appropriate to take the top few thousand genes as ranked by gene-gene correlation than
-consider the significance of the correlations.
+The limitation of this method is that it assumes technical noise is random and
+independent for each cell, thus shouldn't produce gene-gene correlations, but
+this assumption is violated by batch effects which are generally systematic
+between different experimental batches and will produce gene-gene correlations.
+As a result it is more appropriate to take the top few thousand genes as ranked
+by gene-gene correlation than consider the significance of the correlations.
 
 
 ```{r, eval=FALSE}
@@ -308,55 +444,11 @@ cor_feat <- M3Drop::corFS(expr_matrix)
 Cor_genes <- names(cor_feat)[1:1500]
 ```
 
-Lastly, another common method for feature selection in scRNASeq data is to use PCA loadings. Genes with
-high PCA loadings are likely to be highly variable and correlated with many other variable genes, thus
-may be relevant to the underlying biology. However, as with gene-gene correlations PCA loadings tend to
-be susceptible to detecting systematic variation due to batch effects; thus it is recommended to plot the PCA
-results to determine those components corresponding to the biological variation rather than batch effects.
-
-```{r, fig.width=7, fig.height=6}
-# PCA is typically performed on log-transformed expression data
-pca <- prcomp(log(expr_matrix + 1) / log(2))
 
-# plot projection
-plot(
-    pca$rotation[,1], 
-    pca$rotation[,2], 
-    pch = 16, 
-    col = cell_colors[as.factor(celltype_labs)]
-) 
-# calculate loadings for components 1 and 2
-score <- rowSums(abs(pca$x[,c(1,2)])) 
-names(score) <- rownames(expr_matrix)
-score <- score[order(-score)]
-PCA_genes <- names(score[1:1500])
-```
-__Exercise 6__
-Consider the top 5 principal components. Which appear to be most biologically relevant? How does the top 1,500
-features change if you consider the loadings for those components?
-```{r, include=FALSE}
-plot(
-    pca$rotation[,2], 
-    pca$rotation[,3], 
-    pch = 16, 
-    col = cell_colors[as.factor(celltype_labs)]
-)
-plot(
-    pca$rotation[,3], 
-    pca$rotation[,4], 
-    pch = 16, 
-    col = cell_colors[as.factor(celltype_labs)]
-)
-# calculate loadings for components 1 and 2
-score <- rowSums(abs(pca$x[,c(2, 3, 4)]))
-names(score) <- rownames(expr_matrix)
-score <- score[order(-score)]
-PCA_genes2 = names(score[1:1500])
-```
 ### Comparing Methods
 
-We can check whether the identified features really do represent genes differentially expressed between
-cell-types in this dataset.
+We can check whether the identified features really do represent genes
+differentially expressed between cell-types in this dataset.
 
 ```{r, fig.width = 7, fig.height = 10}
 M3DropExpressionHeatmap(
@@ -367,13 +459,25 @@ M3DropExpressionHeatmap(
 ```
 
 We can also consider how consistent each feature selection method is with the others using the Jaccard Index:
+
 ```{r}
 J <- sum(M3Drop_genes %in% HVG_genes)/length(unique(c(M3Drop_genes, HVG_genes)))
 ```
 
-__Exercise 7__
+__Exercise 6__
+
+Plot the expression of the features for each of the other methods. Which appear
+to be differentially expressed? How consistent are the different methods for
+this dataset?
+
+```{r, fig.width = 7, fig.height = 10}
+M3DropExpressionHeatmap(
+    DANB_genes,
+    expr_matrix,
+    cell_labels = celltype_labs
+)
+```
 
-Plot the expression of the features for each of the other methods. Which appear to be differentially expressed? How consistent are the different methods for this dataset?
 
 ```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10}
 M3DropExpressionHeatmap(
@@ -393,7 +497,7 @@ M3DropExpressionHeatmap(
 
 ```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10}
 M3DropExpressionHeatmap(
-    PCA_genes,
+    simplesinglecell_genes,
     expr_matrix,
     cell_labels = celltype_labs
 )
@@ -401,19 +505,21 @@ M3DropExpressionHeatmap(
 
 ```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10}
 M3DropExpressionHeatmap(
-    PCA_genes2,
+    sct_genes,
     expr_matrix,
     cell_labels = celltype_labs
 )
 ```
 
-```{r, eval=FALSE, include=FALSE}
+Jaccard index comparison of sets of informative features:
+
+```{r, eval=TRUE, include=TRUE}
 list_of_features <- list(
-    M3Drop_genes, 
-    HVG_genes, 
-    Cor_genes, 
-    PCA_genes, 
-    PCA_genes2
+  M3Drop_genes,
+  DANB_genes,
+  HVG_genes, 
+  simplesinglecell_genes, 
+  sct_genes
 )
 Out <- matrix(
     0, 
@@ -426,7 +532,8 @@ for(i in 1:length(list_of_features) ) {
             length(unique(c(list_of_features[[i]], list_of_features[[j]])))
      }
 }
-colnames(Out) <- rownames(Out) <- c("M3Drop", "HVG", "Cor", "PCA", "PCA2")
+colnames(Out) <- rownames(Out) <- c("M3Drop", "DANB", "Brennecke", "simpleSingleCell", "sctransform")
+Out
 ```
 
 
-- 
GitLab