From 31532ab6e966b4ba662da495d9dd89ef524e1662 Mon Sep 17 00:00:00 2001 From: Davis McCarthy <davismcc@gmail.com> Date: Tue, 1 Oct 2019 18:54:00 +1000 Subject: [PATCH] Updating feature selection chapter --- course_files/feature-selection.Rmd | 317 +++++++++++++++++++---------- 1 file changed, 212 insertions(+), 105 deletions(-) diff --git a/course_files/feature-selection.Rmd b/course_files/feature-selection.Rmd index 965e18f..58a4cc4 100644 --- a/course_files/feature-selection.Rmd +++ b/course_files/feature-selection.Rmd @@ -4,7 +4,7 @@ output: html_document --- ```{r setup, echo=FALSE} -knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE) +knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE, warning=FALSE, message=FALSE) knitr::opts_knit$set(root.dir = normalizePath("..")) ``` @@ -16,6 +16,9 @@ library(matrixStats) library(M3Drop) library(RColorBrewer) library(SingleCellExperiment) +library(Polychrome) +library(scater) +library(scran) set.seed(1) ``` @@ -47,7 +50,8 @@ For this section we will continue working with the Deng data. ```{r} deng <- readRDS("data/deng/deng-reads.rds") celltype_labs <- colData(deng)$cell_type2 -cell_colors <- brewer.pal(max(3,length(unique(celltype_labs))), "Set3") +cell_colors <- createPalette(10, c("#010101", "#ff0000"), M=1000) +names(cell_colors) <- unique(as.character(celltype_labs)) ``` Feature selection is performed after QC, however this data has already been QCed so @@ -87,12 +91,13 @@ first is to identify genes which behave differently from a null model describing just the technical noise expected in the dataset. If the dataset contains spike-in RNAs they can be used to directly model -technical noise. However, measurements of spike-ins may not experience -the same technical noise as endogenous transcripts [(Svensson et al., 2017)](https://www.nature.com/nmeth/journal/v14/n4/full/nmeth.4220.html). -In addition, scRNASeq experiments often contain only a small number of -spike-ins which reduces our confidence in fitted model parameters. +technical noise. However, measurements of spike-ins may not experience the same +technical noise as endogenous transcripts [(Svensson et al., +2017)](https://www.nature.com/nmeth/journal/v14/n4/full/nmeth.4220.html). In +addition, scRNASeq experiments often contain only a small number of spike-ins +which reduces our confidence in fitted model parameters. -#### Highly Variable Genes +#### Highly Variable Genes - Brennecke method The first method proposed to identify features in scRNASeq datasets was to identify highly variable genes (HVG). HVG assumes that if genes @@ -117,20 +122,22 @@ plot( main="" ) ``` -A popular method to correct for the relationship between variance and mean expression -was proposed by [Brennecke et al.](http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2645.html). -To use the Brennecke method, we first normalize for library size then calculate -the mean and the square coefficient of variation (variation divided by -the squared mean expression). A quadratic curve is fit to the relationship -between these two variables for the ERCC spike-in, and then a chi-square test is used to find genes -significantly above the curve. This method is included in the M3Drop package as the -Brennecke_getVariableGenes(counts, spikes) function. However, this dataset does not contain spike-ins -so we will use the entire dataset to estimate the technical noise. - -In the figure below the red curve -is the fitted technical noise model and the dashed line is the 95% -CI. Pink dots are the genes with significant biological variability -after multiple-testing correction. + +An early method to correct for the relationship between variance and mean +expression was proposed by [Brennecke et +al.](http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2645.html). To use +the Brennecke method, we first normalize for library size then calculate the +mean and the square coefficient of variation (variation divided by the squared +mean expression). A quadratic curve is fit to the relationship between these two +variables for the ERCC spike-in, and then a chi-square test is used to find +genes significantly above the curve. This method is included in the M3Drop +package as the Brennecke_getVariableGenes(counts, spikes) function. However, +this dataset does not contain spike-ins so we will use the entire dataset to +estimate the technical noise. + +In the figure below the red curve is the fitted technical noise model and the +dashed line is the 95% CI. Pink dots are the genes with significant biological +variability after multiple-testing correction. ```{r, fig.width = 7, fig.height = 6} Brennecke_HVG <- BrenneckeGetVariableGenes( @@ -140,9 +147,10 @@ Brennecke_HVG <- BrenneckeGetVariableGenes( ) ``` -This function returns a matrix of significant genes as well as their estimated effect size (difference -between observed and expected coefficient of variation), and their significance as raw p.values and -FDR corrected q.values. For now we will just keep the names of the significant HVG genes. +This function returns a matrix of significant genes as well as their estimated +effect size (difference between observed and expected coefficient of variation), +and their significance as raw p.values and FDR corrected q.values. For now we +will just keep the names of the significant HVG genes. ```{r} HVG_genes <- Brennecke_HVG$Gene @@ -154,23 +162,73 @@ How many genes were signifcant using BrenneckeGetVariableGenes? ```{r, echo=FALSE, fig.width = 8.5, fig.height = 6} length(HVG_genes) ``` + +#### Highly Variable Genes - simpleSingleCell method + +The Bioconductor +[simpleSingleCell](https://bioconductor.org/packages/release/workflows/html/simpleSingleCell.html) +workflow has a great deal of excellent material to help your analyses. Here, we +show how to identify highly variable genes using functionality from the `scran` +package. + +This method assumes that technical variance is captured by a Poisson +distribution, and that variance beyond that explained by a Poisson distribution +represents biological variance of interest. This approach separates the +biological component of the variance from the technical component and thus can +rank genes based on their "biological" variance. This model also provides +p-values (with FDR adjustment) that can be used to identify the set of +"significant" highly variable genes at a given significance level. + +```{r hvg-simpleSingleCell} +### mamke a technical trend of variance based on Poisson +var.fit <- trendVar(deng, parametric=TRUE, loess.args=list(span=0.4), use.spikes = FALSE) +var.out <- decomposeVar(deng, var.fit) +plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression", + ylab="Variance of log-expression") +points(var.out$mean[isSpike(deng)], var.out$total[isSpike(deng)], col="red", pch=16) +curve(var.fit$trend(x), col="dodgerblue", add=TRUE, lwd=2) + +chosen.genes <- order(var.out$bio, decreasing=TRUE)[1:10] +plotExpression(deng, rownames(var.out)[chosen.genes], + point_alpha=0.5, jitter_type="jitter") + +top.dec <- var.out[order(var.out$bio, decreasing=TRUE),] + # the highly variable genes with largest biological components +head(top.dec) +simplesinglecell_genes <- rownames(top.dec)[top.dec$FDR < 0.001] +table(top.dec$FDR < 0.001) +``` + +If we set an FDR threshold of 0.1%, this approach identifies around 1300 highly +variable genes. + +The output of this variance modelling can be used as input to a `denoisePCA()` +function to compute "denoised" principal components for clustering and other +downstream analyses (details not shown here; please see the `simpleSingleCell` +workflow). + + #### High Dropout Genes -An alternative to finding HVGs is to identify genes with unexpectedly high numbers of zeros. -The frequency of zeros, known as the "dropout rate", is very closely related to expression level -in scRNASeq data. Zeros are the dominant feature of single-cell RNASeq data, typically accounting -for over half of the entries in the final expression matrix. These zeros predominantly result -from the failure of mRNAs failing to be reversed transcribed [(Andrews and Hemberg, 2016)](http://www.biorxiv.org/content/early/2017/05/25/065094). Reverse transcription -is an enzyme reaction thus can be modelled using the Michaelis-Menten equation: +An alternative to finding HVGs is to identify genes with unexpectedly high +numbers of zeros. The frequency of zeros, known as the "dropout rate", is very +closely related to expression level in scRNASeq data. Zeros are the dominant +feature of single-cell RNASeq data, typically accounting for over half of the +entries in the final expression matrix. These zeros predominantly result from +the failure of mRNAs failing to be reversed transcribed [(Andrews and Hemberg, +2016)](http://www.biorxiv.org/content/early/2017/05/25/065094). Reverse +transcription is an enzyme reaction thus can be modelled using the +Michaelis-Menten equation: $$P_{dropout} = 1 - S/(K + S)$$ -where $S$ is the mRNA concentration in the cell (we will estimate this as average expression) -and $K$ is the Michaelis-Menten constant. +where $S$ is the mRNA concentration in the cell (we will estimate this as +average expression) and $K$ is the Michaelis-Menten constant. -Because the Michaelis-Menten equation is a convex non-linear function, genes which are -differentially expression across two or more populations of cells in our dataset will -be shifted up/right of the Michaelis-Menten model (see Figure below). +Because the Michaelis-Menten equation is a convex non-linear function, genes +which are differentially expression across two or more populations of cells in +our dataset will be shifted up/right of the Michaelis-Menten model (see Figure +below). ```{r, fig.width = 8.5, fig.height = 6, echo=TRUE} K <- 49 @@ -205,9 +263,12 @@ points( cex = 3 ) ``` -__Note__: add `log="x"` to the `plot` call above to see how this looks on the log scale, which is used in M3Drop figures. -__Exercise 4__: Produce the same plot as above with different expression levels (S1 & S2) and/or mixtures (mix). +__Note__: add `log="x"` to the `plot` call above to see how this looks on the +log scale, which is used in M3Drop figures. + +__Exercise 4__: Produce the same plot as above with different expression levels +(S1 & S2) and/or mixtures (mix). ```{r, include=FALSE} plot( @@ -287,20 +348,95 @@ How many genes were signifcant using NBumiFeatureSelectionCombinedDrop? nrow(DropFS) ``` +#### Residual variance from a (regularized) negative binomial model + +In the [normalization chapter](#normalization-theory) we introduced the +`sctransform` approach to using Pearson residuals from an regularized negative +binomial generalized linear model to normalize scRNA-seq data. + +The residual variance of genes (i.e. the variance of the Pearson residuals) +provides a way to identify highly variable genes, where the "variance" is +decoupled from the average level of expression of the gene. + +The residual variance is easily accessible from the `sctransform` output as we +show below. + +First, we run `sctransform` as we did previously. + +```{r sctransform-apply, warning=FALSE, message=FALSE} +deng_sparse <- as(counts(deng), "dgCMatrix") +### Genes expressed in at least 5 cells will be kept +sctnorm_data <- sctransform::vst(umi = deng_sparse, min_cells = 1, + cell_attr = as.data.frame(colData(deng)), + latent_var = "log10_total_counts_endogenous") +sctnorm_data$model_str +``` + + +```{r sctransform-feature-select, warning=FALSE, message=FALSE} +library(ggplot2) +ggplot(sctnorm_data$gene_attr, aes(residual_variance)) + + geom_histogram(binwidth=0.1) + + geom_vline(xintercept=1, color='red') + xlim(0, 10) + +sctnorm_data$gene_attr$label <- rownames(sctnorm_data$gene_attr) +ggplot(sctnorm_data$gene_attr, aes(x = gmean, y=residual_variance)) + + geom_point(alpha = 0.6) + + geom_point(colour = "firebrick2", + data = sctnorm_data$gene_attr[sctnorm_data$gene_attr$residual_variance > 3,]) + + scale_x_log10() + + geom_hline(yintercept = 1, size = 3, color = "dodgerblue") + + geom_label(aes(label = label), + data = sctnorm_data$gene_attr[sctnorm_data$gene_attr$residual_variance > 30,]) + + theme_bw() + +sct_genes <- rownames(sctnorm_data$gene_attr)[sctnorm_data$gene_attr$residual_variance > 4] +table(sctnorm_data$gene_attr$residual_variance > 4) +``` + +If we set a (relatively arbitrary) threshold of a residual variance greater than +three marking a "highly variable gene", then we identify around 2000 highly +variable genes with this `sctransform` approach. + + +[NB: the `deng` data is extremely high depth for scRNA-seq data, so not the most +applicable dataset for `sctransform`, but we include this analysis here to +demonstrate the method rather than make any evaluation of its performance in +general.] + + +Although not explored here, the _deviance_ statistic from the regularized NB GLM +fit provides a natural way to select informative features for downstream +analyses. + +The [deviance](https://en.wikipedia.org/wiki/Deviance_(statistics)) is a +goodness-of-fit statistic for a statistical model. As Wikipedia notes, deviance +is a generalization of the idea of using the sum of squares of residuals in +ordinary least squares to cases where model-fitting is achieved by maximum +likelihood. It plays an important role in exponential dispersion models and +generalized linear models, such as the negative binomial model. + +However, `sctransform` does not seem set up to use the model deviance to select +informative features, but we expect this could be a direction the field goes in +the near future. Keep an eye out! + + ### Correlated Expression -A completely different approach to feature selection is to use gene-gene correlations. This method -is based on the idea that multiple genes will be differentially expressed between different cell-types -or cell-states. Genes which are expressed in the same cell-population will be positively correlated -with each other where as genes expressed in different cell-populations will be negatively correated with -each other. Thus important genes can be identified by the magnitude of their correlation -with other genes. +A completely different approach to feature selection is to use gene-gene +correlations. This method is based on the idea that multiple genes will be +differentially expressed between different cell-types or cell-states. Genes +which are expressed in the same cell-population will be positively correlated +with each other where as genes expressed in different cell-populations will be +negatively correated with each other. Thus important genes can be identified by +the magnitude of their correlation with other genes. -The limitation of this method is that it assumes technical noise is random and independent for each cell, -thus shouldn't produce gene-gene correlations, but this assumption is violated by batch effects which are -generally systematic between different experimental batches and will produce gene-gene correlations. As a -result it is more appropriate to take the top few thousand genes as ranked by gene-gene correlation than -consider the significance of the correlations. +The limitation of this method is that it assumes technical noise is random and +independent for each cell, thus shouldn't produce gene-gene correlations, but +this assumption is violated by batch effects which are generally systematic +between different experimental batches and will produce gene-gene correlations. +As a result it is more appropriate to take the top few thousand genes as ranked +by gene-gene correlation than consider the significance of the correlations. ```{r, eval=FALSE} @@ -308,55 +444,11 @@ cor_feat <- M3Drop::corFS(expr_matrix) Cor_genes <- names(cor_feat)[1:1500] ``` -Lastly, another common method for feature selection in scRNASeq data is to use PCA loadings. Genes with -high PCA loadings are likely to be highly variable and correlated with many other variable genes, thus -may be relevant to the underlying biology. However, as with gene-gene correlations PCA loadings tend to -be susceptible to detecting systematic variation due to batch effects; thus it is recommended to plot the PCA -results to determine those components corresponding to the biological variation rather than batch effects. - -```{r, fig.width=7, fig.height=6} -# PCA is typically performed on log-transformed expression data -pca <- prcomp(log(expr_matrix + 1) / log(2)) -# plot projection -plot( - pca$rotation[,1], - pca$rotation[,2], - pch = 16, - col = cell_colors[as.factor(celltype_labs)] -) -# calculate loadings for components 1 and 2 -score <- rowSums(abs(pca$x[,c(1,2)])) -names(score) <- rownames(expr_matrix) -score <- score[order(-score)] -PCA_genes <- names(score[1:1500]) -``` -__Exercise 6__ -Consider the top 5 principal components. Which appear to be most biologically relevant? How does the top 1,500 -features change if you consider the loadings for those components? -```{r, include=FALSE} -plot( - pca$rotation[,2], - pca$rotation[,3], - pch = 16, - col = cell_colors[as.factor(celltype_labs)] -) -plot( - pca$rotation[,3], - pca$rotation[,4], - pch = 16, - col = cell_colors[as.factor(celltype_labs)] -) -# calculate loadings for components 1 and 2 -score <- rowSums(abs(pca$x[,c(2, 3, 4)])) -names(score) <- rownames(expr_matrix) -score <- score[order(-score)] -PCA_genes2 = names(score[1:1500]) -``` ### Comparing Methods -We can check whether the identified features really do represent genes differentially expressed between -cell-types in this dataset. +We can check whether the identified features really do represent genes +differentially expressed between cell-types in this dataset. ```{r, fig.width = 7, fig.height = 10} M3DropExpressionHeatmap( @@ -367,13 +459,25 @@ M3DropExpressionHeatmap( ``` We can also consider how consistent each feature selection method is with the others using the Jaccard Index: + ```{r} J <- sum(M3Drop_genes %in% HVG_genes)/length(unique(c(M3Drop_genes, HVG_genes))) ``` -__Exercise 7__ +__Exercise 6__ + +Plot the expression of the features for each of the other methods. Which appear +to be differentially expressed? How consistent are the different methods for +this dataset? + +```{r, fig.width = 7, fig.height = 10} +M3DropExpressionHeatmap( + DANB_genes, + expr_matrix, + cell_labels = celltype_labs +) +``` -Plot the expression of the features for each of the other methods. Which appear to be differentially expressed? How consistent are the different methods for this dataset? ```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} M3DropExpressionHeatmap( @@ -393,7 +497,7 @@ M3DropExpressionHeatmap( ```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} M3DropExpressionHeatmap( - PCA_genes, + simplesinglecell_genes, expr_matrix, cell_labels = celltype_labs ) @@ -401,19 +505,21 @@ M3DropExpressionHeatmap( ```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} M3DropExpressionHeatmap( - PCA_genes2, + sct_genes, expr_matrix, cell_labels = celltype_labs ) ``` -```{r, eval=FALSE, include=FALSE} +Jaccard index comparison of sets of informative features: + +```{r, eval=TRUE, include=TRUE} list_of_features <- list( - M3Drop_genes, - HVG_genes, - Cor_genes, - PCA_genes, - PCA_genes2 + M3Drop_genes, + DANB_genes, + HVG_genes, + simplesinglecell_genes, + sct_genes ) Out <- matrix( 0, @@ -426,7 +532,8 @@ for(i in 1:length(list_of_features) ) { length(unique(c(list_of_features[[i]], list_of_features[[j]]))) } } -colnames(Out) <- rownames(Out) <- c("M3Drop", "HVG", "Cor", "PCA", "PCA2") +colnames(Out) <- rownames(Out) <- c("M3Drop", "DANB", "Brennecke", "simpleSingleCell", "sctransform") +Out ``` -- GitLab