Commit c2110f29 authored by Davis McCarthy's avatar Davis McCarthy
Browse files

Aesthetic tweaks across chapters

parent 01353111
......@@ -1260,4 +1260,108 @@ doi = {10.18637/jss.v059.i10}
pages={574574},
year={2019},
publisher={Cold Spring Harbor Laboratory}
}
\ No newline at end of file
}
@ARTICLE{Soneson2018-hy,
title = "{Bias, robustness and scalability in single-cell differential
expression analysis}",
author = "Soneson, Charlotte and Robinson, Mark D",
abstract = "Many methods have been used to determine differential gene
expression from single-cell RNA (scRNA)-seq data. We evaluated
36 approaches using experimental and synthetic data and found
considerable differences in the number and characteristics of
the genes that are called differentially expressed. Prefiltering
of lowly expressed genes has important effects, particularly for
some of the methods developed for bulk RNA-seq data analysis.
However, we found that bulk RNA-seq analysis methods do not
generally perform worse than those developed specifically for
scRNA-seq. We also present conquer, a repository of consistently
processed, analysis-ready public scRNA-seq data sets that is
aimed at simplifying method evaluation and reanalysis of
published results. Each data set provides abundance estimates
for both genes and transcripts, as well as quality control and
exploratory analysis reports.",
journal = "Nature methods",
publisher = "Nature Publishing Group, a division of Macmillan Publishers
Limited. All Rights Reserved.",
month = feb,
year = 2018,
url = "http://dx.doi.org/10.1038/nmeth.4612",
issn = "1548-7091",
doi = "10.1038/nmeth.4612"
}
@ARTICLE{Finak2015-ow,
title = "{MAST: a flexible statistical framework for assessing
transcriptional changes and characterizing heterogeneity in
single-cell RNA sequencing data}",
author = "Finak, Greg and McDavid, Andrew and Yajima, Masanao and Deng,
Jingyuan and Gersuk, Vivian and Shalek, Alex K and Slichter,
Chloe K and Miller, Hannah W and McElrath, M Juliana and Prlic,
Martin and Linsley, Peter S and Gottardo, Raphael",
abstract = "Single-cell transcriptomics reveals gene expression heterogeneity
but suffers from stochastic dropout and characteristic bimodal
expression distributions in which expression is either strongly
non-zero or non-detectable. We propose a two-part, generalized
linear model for such bimodal data that parameterizes both of
these features. We argue that the cellular detection rate, the
fraction of genes expressed in a cell, should be adjusted for as
a source of nuisance variation. Our model provides gene set
enrichment analysis tailored to single-cell data. It provides
insights into how networks of co-expressed genes evolve across an
experimental treatment. MAST is available at
https://github.com/RGLab/MAST .",
journal = "Genome biology",
volume = 16,
number = 1,
pages = "1--13",
year = 2015,
url = "http://dx.doi.org/10.1186/s13059-015-0844-5",
issn = "1465-6906, 1474-760X",
doi = "10.1186/s13059-015-0844-5"
}
@ARTICLE{Bais2019-wv,
title = "{scds: Computational Annotation of Doublets in Single-Cell RNA
Sequencing Data}",
author = "Bais, Abha S and Kostka, Dennis",
abstract = "MOTIVATION: Single-cell RNA sequencing (scRNA-seq) technologies
enable the study of transcriptional heterogeneity at the
resolution of individual cells and have an increasing impact on
biomedical research. However, it is known that these methods
sometimes wrongly consider two or more cells as single cells, and
that a number of so-called doublets is present in the output of
such experiments. Treating doublets as single cells in downstream
analyses can severely bias a study's conclusions, and therefore
computational strategies for the identification of doublets are
needed. RESULTS: With scds, we propose two new approaches for in
silico doublet identification: Co-expression based doublet
scoring (cxds) and binary classification based doublet scoring
(bcds). The co-expression based approach, cxds, utilizes
binarized (absence/presence) gene expression data and, employing
a binomial model for the co-expression of pairs of genes, yields
interpretable doublet annotations. bcds, on the other hand, uses
a binary classification approach to discriminate artificial
doublets from original data. We apply our methods and existing
computational doublet identification approaches to four data sets
with experimental doublet annotations and find that our methods
perform at least as well as the state of the art, at comparably
little computational cost. We observe appreciable differences
between methods and across data sets and that no approach
dominates all others. In summary, scds presents a scalable,
competitive approach that allows for doublet annotation of data
sets with thousands of cells in a matter of seconds.
AVAILABILITY: scds is implemented as a Bioconductor R package
(doi: 10.18129/B9.bioc.scds). SUPPLEMENTARY INFORMATION:
Supplementary data are available at Bioinformatics online.",
journal = "Bioinformatics",
month = sep,
year = 2019,
url = "http://dx.doi.org/10.1093/bioinformatics/btz698",
language = "en",
issn = "1367-4803, 1367-4811",
pmid = "31501871",
doi = "10.1093/bioinformatics/btz698"
}
......@@ -46,7 +46,7 @@ that it is typically much easier to visualize the data in a 2 or
* Scalability: in the last few years the number of cells in scRNA-seq experiments has grown by several orders of magnitude from ~$10^2$ to ~$10^6$
### unsupervised Clustering methods
### Unsupervised clustering methods
Three main ingredients of a complete clustering method:
......
......@@ -3,7 +3,7 @@ knit: bookdown::preview_chapter
---
```{r setup, echo=FALSE}
knitr::opts_chunk$set(out.width='90%', fig.align = 'center', echo=FALSE, eval=TRUE)
knitr::opts_chunk$set(out.width='90%', fig.align = 'center', echo=FALSE, eval=TRUE, warning=FALSE, message=FALSE)
knitr::opts_knit$set(root.dir = normalizePath(".."))
```
......
......@@ -4,7 +4,7 @@ knit: bookdown::preview_chapter
```{r, echo=FALSE}
library(knitr)
opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE)
opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE, warning=FALSE, message=FALSE)
knitr::opts_knit$set(root.dir = normalizePath(".."))
```
......
......@@ -13,38 +13,95 @@ knitr::opts_knit$set(root.dir = normalizePath(".."))
### Bulk RNA-seq
One of the most common types of analyses when working with bulk RNA-seq
data is to identify differentially expressed genes. By comparing the
genes that change between two conditions, e.g. mutant and wild-type or
stimulated and unstimulated, it is possible to characterize the
molecular mechanisms underlying the change.
Several different methods,
e.g. [DESeq2](https://bioconductor.org/packages/DESeq2) and
[edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html),
have been developed for bulk RNA-seq. Moreover, there are also
extensive
One of the most common types of analyses when working with bulk RNA-seq data is
to identify differentially expressed genes. By comparing the genes that change
between two or more conditions, e.g. mutant and wild-type or stimulated and
unstimulated, it is possible to characterize the molecular mechanisms underlying
the change.
Several different methods, e.g.
[edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) and
[DESeq2](https://bioconductor.org/packages/DESeq2) and more, have been developed
for bulk RNA-seq and become established as parts of robust and widely-used
analysis workflows. Moreover, there are also extensive
[datasets](http://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-9-r95)
available where the RNA-seq data has been validated using
RT-qPCR. These data can be used to benchmark DE finding algorithms and the available evidence suggests that the algorithms are performing quite well.
available where the RNA-seq data has been validated using RT-qPCR. These data
can be used to benchmark DE finding algorithms and the available evidence
suggests that the algorithms are performing well.
### Single cell RNA-seq
In contrast to bulk RNA-seq, in scRNA-seq we usually do not have a defined
set of experimental conditions. Instead, as was shown in a previous chapter
In contrast to bulk RNA-seq, in scRNA-seq we often do not have a defined set
of experimental conditions. Instead, as was shown in a previous chapter
(\@ref(clust-methods)) we can identify the cell groups by using an unsupervised
clustering approach. Once the groups have been identified one can find differentially
expressed genes either by comparing the differences in variance between the groups (like the Kruskal-Wallis test implemented in SC3), or by comparing gene expression between clusters in a pairwise manner. In the following chapter we will mainly consider tools developed for pairwise comparisons.
clustering approach. Once the groups have been identified one can find
differentially expressed genes either by comparing the differences in variance
between the groups (like the Kruskal-Wallis test implemented in SC3), or by
comparing gene expression between clusters in a pairwise manner. In the
following chapter we will mainly consider tools developed for pairwise
comparisons.
These method may also be applied when comparing cells obtained from different
groups or conditions. Such analyses can be complicated by differing cell type
proportions between samples (i.e. distinct samples cell populations; the unit of
replication in the study). In such cases, it is likely beneficial to identify
distinct cell types and conduct differential expression testing between
conditions within each cell type.
### Differences in Distribution
Unlike bulk RNA-seq, we generally have a large number of samples (i.e. cells) for each group we are comparing in single-cell experiments. Thus we can take advantage of the whole distribution of expression values in each group to identify differences between groups rather than only comparing estimates of mean-expression as is standard for bulk RNASeq.
Unlike bulk RNA-seq, we generally have a large number of samples (i.e. cells)
for each group we are comparing in single-cell experiments. Thus we may be able
to take advantage of the whole distribution of expression values in each group
to identify differences between groups rather than only comparing estimates of
mean-expression as is standard for bulk RNASeq.
There are two main approaches to comparing distributions. Firstly, we can use
existing statistical models/distributions and fit the same type of model to the
expression in each group then test for differences in the parameters for each
model, or test whether the model fits better if a particular parameter is allowed
to be different according to group. For instance in Chapter
\@ref(dealing-with-confounders) we used `edgeR` to test whether allowing mean
expression to be different in different batches significantly improved the fit
of a negative binomial model of the data.
Alternatively, we can use a non-parametric test which does not assume that
expression values follow any particular distribution, e.g. the
[Kolmogorov-Smirnov test
(KS-test)](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test).
Non-parametric tests generally convert observed expression values to ranks and
test whether the distribution of ranks for one group are signficantly different
from the distribution of ranks for the other group. However, some non-parametric
methods fail in the presence of a large number of tied values, such as the case
for dropouts (zeros) in single-cell RNA-seq expression data. Moreover, if the
conditions for a parametric test hold, then it will typically be more powerful
than a non-parametric test.
### Benchmarking of DE methods for scRNA-seq data
So far there has been one high-quality benchmarking study of single-cell
differential expression methods [@Soneson2018-hy]. The figure below summarises
the results from that paper (which is well worth reading in full!):
```{r de-benchmarking, out.width='90%', fig.cap="Figure 5 reproduced from Soneson and Robinson (2018). Summary of DE method performance across all major evaluation criteria. Criteria and cutoff values for performance categories are available in the Online Methods. Methods are ranked by their average performance across the criteria, with the numerical encoding good = 2, intermediate = 1, poor = 0. NODES and SAMseq do not return nominal P values and were therefore not evaluated in terms of the FPR."}
knitr::include_graphics("figures/soneson-de-benchmark-fig5.png")
```
There are two main approaches to comparing distributions. Firstly, we can use existing statistical models/distributions and fit the same type of model to the expression in each group then test for differences in the parameters for each model, or test whether the model fits better if a particular paramter is allowed to be different according to group. For instance in Chapter \@ref(dealing-with-confounders) we used edgeR to test whether allowing mean expression to be different in different batches significantly improved the fit of a negative binomial model of the data.
One particularly surprising outcome of this benchmarking study is that almost
all methods designed specifically for the analysis of scRNA-seq data are
outperformed by established bulk RNA-seq DE methods (edgeR, limma) and standard,
classical statistical methods (t-test, Wilcoxon rank-sum tests). MAST
[@Finak2015-ow] is the only method designed specifically for scRNA-seq data that
performs well in this benchmark. These benchmarking results are a credit to the
durability and flexibility of the leading bulk RNA-seq DE methods and a subtle
indictment of the land rush of new scRNA-seq methods that were published without
adequate comparison to existing bulk RNA-seq methods.
Alternatively, we can use a non-parametric test which does not assume that expression values follow any particular distribution, e.g. the [Kolmogorov-Smirnov test (KS-test)](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test). Non-parametric tests generally convert observed expression values to ranks and test whether the distribution of ranks for one group are signficantly different from the distribution of ranks for the other group. However, some non-parametric methods fail in the presence of a large number of tied values, such as the case for dropouts (zeros) in single-cell RNA-seq expression data. Moreover, if the conditions for a parametric test hold, then it will typically be more powerful than a non-parametric test.
### Models of single-cell RNASeq data
### Models of single-cell RNA-seq data
The most common model of RNASeq data is the negative binomial model:
......@@ -67,9 +124,21 @@ $\mu = mu$
Variance:
$\sigma^2 = mu + mu^2/size$
It is parameterized by the mean expression (mu) and the dispersion (size), which is inversely related to the variance. The negative binomial model fits bulk RNA-seq data very well and it is used for most statistical methods designed for such data. In addition, it has been show to fit the distribution of molecule counts obtained from data tagged by unique molecular identifiers (UMIs) quite well ([Grun et al. 2014](http://www.nature.com/nmeth/journal/v11/n6/full/nmeth.2930.html), [Islam et al. 2011](http://genome.cshlp.org/content/21/7/1160)).
However, a raw negative binomial model does not fit full-length transcript data as well due to the high dropout rates relative to the non-zero read counts. For this type of data a variety of zero-inflated negative binomial models have been proposed (e.g. [MAST](https://bioconductor.org/packages/release/bioc/html/MAST.html), [SCDE](https://bioconductor.org/packages/release/bioc/html/scde.html)).
It is parameterized by the mean expression (mu) and the dispersion (size), which
is inversely related to the variance. The negative binomial model fits bulk
RNA-seq data very well and it is used for most statistical methods designed for
such data. In addition, it has been show to fit the distribution of molecule
counts obtained from data tagged by unique molecular identifiers (UMIs) quite
well ([Grun et al.
2014](http://www.nature.com/nmeth/journal/v11/n6/full/nmeth.2930.html), [Islam
et al. 2011](http://genome.cshlp.org/content/21/7/1160)).
However, a raw negative binomial model does not necessarily fit full-length
transcript data as well due to the high dropout rates relative to the non-zero
read counts. For this type of data a variety of zero-inflated negative binomial
models have been proposed (e.g.
[MAST](https://bioconductor.org/packages/release/bioc/html/MAST.html),
[SCDE](https://bioconductor.org/packages/release/bioc/html/scde.html)).
```{r zero-inflation-plot, fig.cap="Zero-inflated Negative Binomial distribution"}
d <- 0.5;
......@@ -92,9 +161,20 @@ $\mu = mu \cdot (1 - d)$
Variance:
$\sigma^2 = \mu \cdot (1-d) \cdot (1 + d \cdot \mu + \mu / size)$
These models introduce a new parameter $d$, for the dropout rate, to the negative binomial model. As we saw in Chapter 19, the dropout rate of a gene is strongly correlated with the mean expression of the gene. Different zero-inflated negative binomial models use different relationships between mu and d and some may fit $\mu$ and $d$ to the expression of each gene independently.
Finally, several methods use a Poisson-Beta distribution which is based on a mechanistic model of transcriptional bursting. There is strong experimental support for this model ([Kim and Marioni, 2013](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-1-r7)) and it provides a good fit to scRNA-seq data but it is less easy to use than the negative-binomial models and much less existing methods upon which to build than the negative binomial model.
These models introduce a new parameter $d$, for the dropout rate, to the
negative binomial model. As we saw in Chapter 19, the dropout rate of a gene is
strongly correlated with the mean expression of the gene. Different
zero-inflated negative binomial models use different relationships between mu
and d and some may fit $\mu$ and $d$ to the expression of each gene
independently.
Finally, several methods use a Poisson-Beta distribution which is based on a
mechanistic model of transcriptional bursting. There is strong experimental
support for this model ([Kim and Marioni,
2013](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-1-r7))
and it provides a good fit to scRNA-seq data but it is less easy to use than the
negative-binomial models and much less existing methods upon which to build than
the negative binomial model.
```{r pois-beta-plot, fit.cap="Poisson-Beta distribution"}
a <- 0.1
......@@ -115,9 +195,17 @@ $\mu = g \cdot a / (a + b)$
Variance:
$\sigma^2 = g^2 \cdot a \cdot b/((a + b + 1) \cdot (a + b)^2)$
This model uses three parameters: $a$ the rate of activation of transcription; $b$ the rate of inhibition of transcription; and $g$ the rate of transcript production while transcription is active at the locus. Differential expression methods may test each of the parameters for differences across groups or only one (often $g$).
This model uses three parameters: $a$ the rate of activation of transcription;
$b$ the rate of inhibition of transcription; and $g$ the rate of transcript
production while transcription is active at the locus. Differential expression
methods may test each of the parameters for differences across groups or only
one (often $g$).
All of these models may be further expanded to explicitly account for other sources of gene expression differences such as batch-effect or library depth depending on the particular DE algorithm.
All of these models may be further expanded to explicitly account for other
sources of gene expression differences such as batch-effect or library depth
depending on the particular DE algorithm.
__Exercise__: Vary the parameters of each distribution to explore how they affect the distribution of gene expression. How similar are the Poisson-Beta and Negative Binomial models?
__Exercise__: Vary the parameters of each distribution to explore how they
affect the distribution of gene expression. How similar are the Poisson-Beta and
Negative Binomial models?
......@@ -3,7 +3,7 @@ output: html_document
---
```{r setup, echo=FALSE}
knitr::opts_chunk$set(out.width='90%', fig.align = 'center', echo=FALSE, eval=TRUE)
knitr::opts_chunk$set(out.width='90%', fig.align = 'center', echo=FALSE, eval=TRUE, warning=FALSE, message=FALSE)
knitr::opts_knit$set(root.dir = normalizePath(".."))
```
......@@ -80,7 +80,7 @@ plotRLE(
)
```
```{r norm-ours-sctransform-reads}
```{r norm-ours-sctransform-reads, results='hide'}
umi_sparse <- as(counts(reads.qc), "dgCMatrix")
### Genes expressed in at least 5 cells will be kept
sctnorm_data <- sctransform::vst(umi = umi_sparse, min_cells = 1,
......
......@@ -459,13 +459,15 @@ Note that (due to what looks like a bug in this version of `sctransform`) we
need to convert the UMI count matrix to a sparse format to apply sctransform.
```{r sctransform-apply, warning=FALSE, message=FALSE}
```{r sctransform-apply, warning=FALSE, message=FALSE, results='hide'}
umi_sparse <- as(counts(umi.qc), "dgCMatrix")
### Genes expressed in at least 5 cells will be kept
sctnorm_data <- sctransform::vst(umi = umi_sparse, min_cells = 1,
cell_attr = as.data.frame(colData(umi.qc)),
latent_var = "log10_total_counts_endogenous")
```
```{r sctransform-add-to-sce, warning=FALSE, message=FALSE}
## Pearson residuals, or deviance residuals
dim(sctnorm_data$y)
dim(umi.qc)
......
......@@ -346,7 +346,7 @@ We demonstrate the usage of two of these doublet detection tools.
### scds
`scds`[@Bais2019-hf] has two detection methods:
`scds`[@Bais2019-wv] has two detection methods:
1) co-expression based;
2) binary-classification based.
......
......@@ -363,12 +363,16 @@ show below.
First, we run `sctransform` as we did previously.
```{r sctransform-apply, warning=FALSE, message=FALSE}
```{r sctransform-apply, warning=FALSE, message=FALSE, results='hide'}
deng_sparse <- as(counts(deng), "dgCMatrix")
### Genes expressed in at least 5 cells will be kept
sctnorm_data <- sctransform::vst(umi = deng_sparse, min_cells = 1,
cell_attr = as.data.frame(colData(deng)),
latent_var = "log10_total_counts_endogenous")
```
```{r}
sctnorm_data$model_str
```
......@@ -377,7 +381,8 @@ sctnorm_data$model_str
library(ggplot2)
ggplot(sctnorm_data$gene_attr, aes(residual_variance)) +
geom_histogram(binwidth=0.1) +
geom_vline(xintercept=1, color='red') + xlim(0, 10)
geom_vline(xintercept=1, color='red') + xlim(0, 10) +
theme_bw()
sctnorm_data$gene_attr$label <- rownames(sctnorm_data$gene_attr)
ggplot(sctnorm_data$gene_attr, aes(x = gmean, y=residual_variance)) +
......@@ -458,7 +463,8 @@ M3DropExpressionHeatmap(
)
```
We can also consider how consistent each feature selection method is with the others using the Jaccard Index:
We can also consider how consistent each feature selection method is with the
others using the Jaccard Index:
```{r}
J <- sum(M3Drop_genes %in% HVG_genes)/length(unique(c(M3Drop_genes, HVG_genes)))
......@@ -470,7 +476,7 @@ Plot the expression of the features for each of the other methods. Which appear
to be differentially expressed? How consistent are the different methods for
this dataset?
```{r, fig.width = 7, fig.height = 10}
```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10}
M3DropExpressionHeatmap(
DANB_genes,
expr_matrix,
......
......@@ -21,8 +21,8 @@ These observed zero values can represent either missing data (i.e.~a gene is exp
The term ``dropout'' is often used to denote observed zero values in scRNA-seq data, but this term conflates zero values attributable to methodological noise and biologically-true zero expression, so we recommend against its use as a catch-all term for observed zeros.
Sparsity in scRNA-seq data can hinder downstream analyses, but it is challenging to model or handle it appropriately, and thus, there remains an ongoing need for improved methods.
Sparsity pervades all aspects of scRNA-seq data analysis, but here we focus on the linked problems of learning latent spaces and ``imputing'' expression values from scRNA-seq data (\autoref{fig:denoising-imputation}).
Imputation, ``data smoothing'' and ``data reconstruction'' approaches are closely linked to the challenges of normalization.
Sparsity pervades all aspects of scRNA-seq data analysis, but here we focus on the linked problems of learning latent spaces and "imputing" expression values from scRNA-seq data.
Imputation, "data smoothing" and "data reconstruction" approaches are closely linked to the challenges of normalization.
But whereas normalization generally aims to make expression values between cells more comparable to each other, imputation and data smoothing approaches aim to achieve adjusted data values that---it is hoped---better represent the true expression values.
Imputation methods could therefore be used for normalization, but do not entail all possible or useful approaches to normalization.
......@@ -33,9 +33,9 @@ The imputation of missing values has been very successful for genotype data.
Crucially, when imputing genotypes we often know which data are missing (e.g.~when no genotype call is possible due to no coverage of a locus, although see section \autoref{sec:dna-variation-calling} for the challenges with \ac{scdnaseq} data) and rich sources of external information are available (e.g.~haplotype reference panels).
Thus, genotype imputation is now highly accurate and a commonly-used step in data processing for genetic association studies \citep{Das2018-zs}.
The situation is somewhat different for scRNA-seq data, as we do not routinely have external reference information to apply (see \autoref{sec:rna-ref-atlases}).
In addition, we can never be sure which observed zeros represent ``missing data'' and which accurately represent a true gene expression level in the cell \citep{hicks_missing_2018}.
Observed zeros can either represent ``biological'' zeros, i.e.~those present because the true expression level of a gene in a cell was zero.
The situation is somewhat different for scRNA-seq data, as we do not routinely have external reference information to apply.
In addition, we can never be sure which observed zeros represent "missing data" and which accurately represent a true gene expression level in the cell \citep{hicks_missing_2018}.
Observed zeros can either represent "biological" zeros, i.e.~those present because the true expression level of a gene in a cell was zero.
Or they they are the result of methodological noise, which can arise when a gene has true non-zero expression in a cell, but no counts are observed due to failures at any point in the complicated process of processing mRNA transcripts in cells into mapped reads.
Such noise can lead to artefactual zero that are either more systematic (e.g.~sequence-specific mRNA degradation during cell lysis) or that occur by chance (e.g.~barely expressed transcripts that at the same expression level will sometimes be detected and sometimes not, due to sampling variation, e.g~in the sequencing).
The high degree of sparsity in scRNA-seq data therefore arises from technical zeros and true biological zeros, which are difficult to distinguish from one another.
......@@ -52,13 +52,13 @@ It is therefore desirable to improve both statistical methods that work on spars
We define three broad (and sometimes overlapping) categories of methods that can be used to ``impute'' scRNA-seq data in the absence of an external reference:
1. __Model-based imputation methods of technical zeros_ use probabilistic models to identify which observed zeros represent technical rather than biological zeros and aim to impute expression levels just for these technical zeros, leaving other observed expression levels untouched; or
1. __Data-smoothing methods_ define sets of ``similar'' cells (e.g.~cells that are neighbors in a graph or occupy a small region in a latent space) and adjust expression values for each cell based on expression values in similar cells.
1. __Model-based imputation methods of technical zeros__ use probabilistic models to identify which observed zeros represent technical rather than biological zeros and aim to impute expression levels just for these technical zeros, leaving other observed expression levels untouched; or
1. __Data-smoothing methods__ define sets of "similar" cells (e.g.~cells that are neighbors in a graph or occupy a small region in a latent space) and adjust expression values for each cell based on expression values in similar cells.
These methods adjust all expression values, including technical zeros, biological zeros and observed non-zero values.
1. __Data-reconstruction methods_ typically aim to define a latent space representation of the cells.
1. __Data-reconstruction methods__ typically aim to define a latent space representation of the cells.
This is often done through matrix factorization (e.g.~principal component analysis) or, increasingly, through machine learning approaches (e.g.~variational autoencoders that exploit deep neural networks to capture non-linear relationships).
Although a broad class of methods, both matrix factorization methods and autoencoders (among others) are able to ``reconstruct'' the observed data matrix from low-rank or simplified representations.
The reconstructed data matrix will typically no longer be sparse (with many zeros) and the implicitly ``imputed'' data can be used for downstream applications that cannot handle sparse count data.
Although a broad class of methods, both matrix factorization methods and autoencoders (among others) are able to "reconstruct" the observed data matrix from low-rank or simplified representations.
The reconstructed data matrix will typically no longer be sparse (with many zeros) and the implicitly "imputed" data can be used for downstream applications that cannot handle sparse count data.
The first category of methods generally seeks to infer a probabilistic model that captures the data generation mechanism.
Such generative models can be used to identify, probabilistically, which observed zeros correspond to technical zeros (to be imputed) and which correspond to biological zeros (to be left alone).
......@@ -76,9 +76,9 @@ Clustering methods that implicitly impute values, such as CIDR \citep{lin_cidr:_
<!-- \label{fig:denoising-imputation} -->
<!-- \end{figure*} -->
Data-smoothing methods, which adjust all gene expression levels based on expression levels in ``similar'' cells, have also been proposed to handle imputation problems.
We might regard these approaches as ``denoising'' methods.
To take a simplified example (\autoref{fig:denoising-imputation}), we might imagine that single cells originally refer to points in two-dimensional space, but are likely to describe a one-dimensional curve; projecting data points onto that curve eventually allows imputation of the ``missing'' values (but all points are adjusted, or smoothed, not just true technical zeros).
Data-smoothing methods, which adjust all gene expression levels based on expression levels in "similar" cells, have also been proposed to handle imputation problems.
We might regard these approaches as "denoising" methods.
To take a simplified example, we might imagine that single cells originally refer to points in two-dimensional space, but are likely to describe a one-dimensional curve; projecting data points onto that curve eventually allows imputation of the "missing" values (but all points are adjusted, or smoothed, not just true technical zeros).
Prominent data-smoothing approaches to handling sparse counts include:
- diffusion-based MAGIC \citep{dijk_recovering_2018}
......@@ -89,9 +89,9 @@ Prominent data-smoothing approaches to handling sparse counts include:
A major task in the analysis of high-dimensional single-cell data is to find low-dimensional representations of the data that capture the salient biological signals and render the data more interpretable and amenable to further analyses.
As it happens, the matrix factorization and latent-space learning methods used for that task also provide another route for imputation through their ability to \emph{reconstruct} the observed data matrix from simplified representations of it.
\Ac{pca} is one such standard matrix factorization method that can be applied to scRNA-seq data (preferably after suitable data normalization) as are other widely-used general statistical methods like \ac{ica} and \ac{nmf}.
As (linear) matrix factorization methods, \ac{pca}, \ac{ica} and \ac{nmf} decompose the observed data matrix into a ``small'' number of factors in two low-rank matrices, one representing cell-by-factor weights and one gene-by-factor loadings.
As it happens, the matrix factorization and latent-space learning methods used for that task also provide another route for imputation through their ability to _reconstruct_ the observed data matrix from simplified representations of it.
PCA is one such standard matrix factorization method that can be applied to scRNA-seq data (preferably after suitable data normalization) as are other widely-used general statistical methods like ICA and NMF.
As (linear) matrix factorization methods, PCA, ICA and NMF decompose the observed data matrix into a "small" number of factors in two low-rank matrices, one representing cell-by-factor weights and one gene-by-factor loadings.
Many matrix factorization methods with tweaks for single-cell data have been proposed in recent years, including:
- ZIFA, a zero-inflated factor analysis \citep{pierson_zifa:_2015}
......
---
title: "Analysis of single cell RNA-seq data"
author: "Davis McCarthy (<a href = 'https://twitter.com/davisjmcc'>davisjmcc</a>), Ruqian Lyu, PuXue Qiao, Vladimir Kiselev (<a href = 'https://twitter.com/wikiselev'>wikiselev</a>), Tallulah Andrews (<a href = 'https://twitter.com/talandrews'>talandrews</a>), Jennifer Westoby (<a href = 'https://twitter.com/Jenni_Westoby'>Jenni_Westoby</a>), Maren Büttner (<a href = 'https://twitter.com/marenbuettner'>marenbuettner</a>), Jimmy Lee (<a href = 'https://twitter.com/THJimmyLee'>THJimmyLee</a>), Krzysztof Polanski, Sebastian Y. Müller, Elo Madissoon, Stephane Ballereau, Maria Do Nascimento Lopes Primo, Rocio Martinez Nunez and Martin Hemberg (<a href = 'https://twitter.com/m_hemberg'>m_hemberg</a>)"
author: "Ruqian Lyu, PuXue Qiao, and Davis J. McCarthy (<a href = 'https://twitter.com/davisjmcc'>davisjmcc</a>)"
date: "`r Sys.Date()`"
#knit: "bookdown::render_book"
documentclass: book
......@@ -10,6 +10,18 @@ link-citations: yes
always_allow_html: yes
---
This version of the course builds on the May 2019 version of the course authored
by: Vladimir Kiselev (<a href = 'https://twitter.com/wikiselev'>wikiselev</a>),
Tallulah Andrews (<a href = 'https://twitter.com/talandrews'>talandrews</a>),
Davis J. McCarthy (<a href = 'https://twitter.com/davisjmcc'>davisjmcc</a>),
Jennifer Westoby (<a href =
'https://twitter.com/Jenni_Westoby'>Jenni_Westoby</a>), Maren Büttner (<a href =
'https://twitter.com/marenbuettner'>marenbuettner</a>), Jimmy Lee (<a href =
'https://twitter.com/THJimmyLee'>THJimmyLee</a>), Krzysztof Polanski, Sebastian
Y. Müller, Elo Madissoon, Stephane Ballereau, Maria Do Nascimento Lopes Primo,
Rocio Martinez Nunez and Martin Hemberg (<a href =
'https://twitter.com/m_hemberg'>m_hemberg</a>)
# About the course
<!-- > > <span style="color:red">__Important!__ The course will be run on the __2nd - 3rd October 2019, both days 9:00-17:00 Melbourne, Australia time__. </span> -->
......
......@@ -61,10 +61,32 @@ Today, there are also several different platforms available for carrying out one
## Challenges
The main difference between bulk and single cell RNA-seq is that each sequencing library represents a single cell, instead of a population of cells. Therefore, significant attention has to be paid to comparison of the results from different cells (sequencing libraries). The main sources of discrepancy between the libraries are:
The main difference between bulk and single cell RNA-seq is that each sequencing
library represents a single cell, instead of a population of cells. Therefore,
significant attention has to be paid to comparison of the results from different
cells (sequencing libraries). The main sources of discrepancy between the
libraries are:
* __Reverse transcription__ to convert RNA to cDNA is at best <30% efficient
* __Amplification__ (up to 1 million fold)
* __Gene 'dropouts'__ in which a gene is observed at a moderate expression level in one cell but is not detected in another cell [@Kharchenko2014-ts].
In both cases the discrepancies are introduced due to low starting amounts of transcripts since the RNA comes from one cell only. Improving the transcript capture efficiency and reducing the amplification bias are currently active areas of research. However, as we shall see in this course, it is possible to alleviate some of these issues through proper normalization and corrections.
* __Gene 'dropouts'__ in which a gene is observed at a moderate expression level in one cell but is not detected in another cell [@Kharchenko2014-ts]; this can be due to technical factors (e.g. inefficient RT) or true biological variability across cells.
These discrepancies are introduced due to low starting amounts of transcripts
since the RNA comes from one cell only. Improving the transcript capture
efficiency and reducing the amplification bias are currently active areas of
research. However, as we shall see in this course, it is possible to alleviate
some of these issues through proper normalization and corrections and effective
statistical models.
For the analyst, the characteristics of single-cell RNA-seq data lead to
challenges in handling:
* __Sparsity__
* __Variability__
* __Scalability__
* __Complexity__
In this workshop we will present computational approaches that can allow us to
face these challenges as we try to answer biological questions of interest from
single-cell transcriptomic data.
......@@ -3,10 +3,15 @@ output: html_document
---
```{r setup, echo=FALSE}
knitr::opts_chunk$set(fig.align = "center", eval = TRUE)
knitr::opts_chunk$set(fig.align = "center", eval = TRUE, warning=FALSE, message=FALSE)
knitr::opts_knit$set(root.dir = normalizePath(".."))
```
# Latent spaces
In many cases we may like to think of cells sitting in a low-dimensional,
"latent" space that captures relationships between cells more intuitively than
the very high-dimensional gene expression space.
```{r library, echo=TRUE}
library(scater)
......@@ -17,11 +22,6 @@ library(Polychrome)
library(slalom)
```
# Latent spaces
In many cases we may like to think of cells sitting in a low-dimensional,
"latent" space that captures relationships between cells more intuitively than
the very high-dimensional gene expression space.
## Dimensionality reduction
......@@ -517,8 +517,8 @@ model_deng <- trainSlalom(model_deng, nIterations = 1000, seed = 100, tolerance
View results:\
The `plotRelevance` function displays the most relevant terms (factors/pathways) ranked by relevance, showing gene set size and the number of genes gained/lost as active in the pathway as learnt by the model.
```{r, fig.width=10, fig.height=5}
plotRelevance(model_deng)
```{r, fig.width=14, fig.height=7}
plotRelevance(model_deng) + theme_classic(base_size = 8)
```
The `plotTerms` function shows the relevance of all terms in the model, enabling the identification of the most important pathways in the context of all that were included in the model.
```{r}
......@@ -541,9 +541,9 @@ So we want the to find the parameters $\theta$ such that the probability to gene
- __How do we define $Z$?__\
-__The simpliest idea:__ $Z \sim N(0, 1)$.
- __The simplest idea:__ $Z \sim N(0, 1)$.
It is not impossible, because "any distribution in d dimensions can be generated by taking a set of d variables that are normally distributed and mapping them through a sufficiently complicated function. "\
-__A better idea:__
-__A better idea:__
For most of $z$, $P(X|z; \theta)$ will be close to zero, meaning it contribute almost nothing to the estimate of $P(X)$. Thus, we want to sample only those values of $Z$ that are likely to produce $X$.
Denote this distribution of $Z$ as $Q(Z|X)$ (it is infered and therefore depend on $X$).\
__Advantage:__ There will be a lot less possible values of $Z$ under $Q$ compared to random sampling, therefore, it will be easier to compute $E_{Z \sim Q} P(X|Z)$.
......
......@@ -136,7 +136,7 @@ As the plot above shows, PC1 struggles to correctly order cells early and late i
Can bespoke pseudotime methods do better than naive application of PCA?
### TSCAN
## TSCAN
TSCAN [@tscam_rpkg] combines clustering with pseudotime analysis. First it clusters the cells using `mclust`, which is based on a mixture of normal distributions. Then it builds a minimum spanning tree to connect the clusters. The branch of this tree that connects the largest number of clusters is the main branch which is used to determine pseudotime.
......@@ -181,7 +181,7 @@ TSCAN gets the development trajectory the "wrong way around", in the sense that
__Exercise 1__ Compare results for different numbers of clusters (`clusternum`).
### Slingshot
## Slingshot
`Slingshot` [@Street2018-ac] is a single-cell lineage inference tool, it can work with datasets with multiple branches. Slingshot has two stages: 1) the inference of the global lineage structure using MST on clustered data points and 2) the inference of pseudotime variables for cells along each lineage by fitting simultaneous 'principal curves' across multiple lineages.
......@@ -296,7 +296,7 @@ heatmap(heatdata, Colv = NA,
We will regress each gene on the pseudotime variable we have generated, using a general additive model (GAM). This allows us to detect non-linear