@@ -1129,9 +1129,193 @@ doi = {10.18637/jss.v059.i10}

}

@ARTICLE{Bais2019-hf,

title="scds: Computational Annotation of Doublets in {Single-Cell} {RNA}

Sequencing Data",

@article{newman2004finding,

title={Finding and evaluating community structure in networks},

author={Newman, Mark EJ and Girvan, Michelle},

journal={Physical review E},

volume={69},

number={2},

pages={026113},

year={2004},

publisher={APS}

}

@article{blondel2008fast,

title={Fast unfolding of communities in large networks},

author={Blondel, Vincent D and Guillaume, Jean-Loup and Lambiotte, Renaud and Lefebvre, Etienne},

journal={Journal of statistical mechanics: theory and experiment},

volume={2008},

number={10},

pages={P10008},

year={2008},

publisher={IOP Publishing}

}

@article{traag2019louvain,

title={From Louvain to Leiden: guaranteeing well-connected communities},

author={Traag, Vincent A and Waltman, Ludo and van Eck, Nees Jan},

journal={Scientific reports},

volume={9},

year={2019},

publisher={Nature Publishing Group}

}

@article{good2010performance,

title={Performance of modularity maximization in practical contexts},

author={Good, Benjamin H and De Montjoye, Yves-Alexandre and Clauset, Aaron},

journal={Physical Review E},

volume={81},

number={4},

pages={046106},

year={2010},

publisher={APS}

}

@article{freytag2018comparison,

title={Comparison of clustering tools in R for medium-sized 10x Genomics single-cell RNA-sequencing data},

author={Freytag, Saskia and Tian, Luyi and L{\"o}nnstedt, Ingrid and Ng, Milica and Bahlo, Melanie},

journal={F1000Research},

volume={7},

year={2018},

publisher={Faculty of 1000 Ltd}

}

@inproceedings{collins2002generalization,

title={A generalization of principal components analysis to the exponential family},

author={Collins, Michael and Dasgupta, Sanjoy and Schapire, Robert E},

booktitle={Advances in neural information processing systems},

pages={617--624},

year={2002}

}

@inproceedings{hinton2003stochastic,

title={Stochastic neighbor embedding},

author={Hinton, Geoffrey E and Roweis, Sam T},

booktitle={Advances in neural information processing systems},

pages={857--864},

year={2003}

}

@article{maaten2008visualizing,

title={Visualizing data using t-SNE},

author={Maaten, Laurens van der and Hinton, Geoffrey},

journal={Journal of machine learning research},

volume={9},

number={Nov},

pages={2579--2605},

year={2008}

}

@article{moon2017phate,

title={PHATE: a dimensionality reduction method for visualizing trajectory structures in high-dimensional biological data},

author={Moon, Kevin R and van Dijk, David and Wang, Zheng and Chen, William and Hirn, Matthew J and Coifman, Ronald R and Ivanova, Natalia B and Wolf, Guy and Krishnaswamy, Smita},

journal={bioRxiv},

pages={120378},

year={2017},

publisher={Cold Spring Harbor Laboratory}

}

@article{buettner2017f,

title={f-scLVM: scalable and versatile factor analysis for single-cell RNA-seq},

author={Buettner, Florian and Pratanwanich, Naruemon and McCarthy, Davis J and Marioni, John C and Stegle, Oliver},

journal={Genome biology},

volume={18},

number={1},

pages={212},

year={2017},

publisher={BioMed Central}

}

@article{kingma2013auto,

title={Auto-encoding variational bayes},

author={Kingma, Diederik P and Welling, Max},

journal={arXiv preprint arXiv:1312.6114},

year={2013}

}

@article{mcinnes2018umap,

title={Umap: Uniform manifold approximation and projection for dimension reduction},

author={McInnes, Leland and Healy, John and Melville, James},

journal={arXiv preprint arXiv:1802.03426},

year={2018}

}

@article{townes2019feature,

title={Feature Selection and Dimension Reduction for Single Cell RNA-Seq based on a Multinomial Model},

author={Townes, F William and Hicks, Stephanie C and Aryee, Martin J and Irizarry, Rafael A},

journal={bioRxiv},

pages={574574},

year={2019},

publisher={Cold Spring Harbor Laboratory}

}

@ARTICLE{Soneson2018-hy,

title="{Bias, robustness and scalability in single-cell differential

expression analysis}",

author="Soneson, Charlotte and Robinson, Mark D",

abstract="Many methods have been used to determine differential gene

expression from single-cell RNA (scRNA)-seq data. We evaluated

36 approaches using experimental and synthetic data and found

considerable differences in the number and characteristics of

the genes that are called differentially expressed. Prefiltering

of lowly expressed genes has important effects, particularly for

some of the methods developed for bulk RNA-seq data analysis.

However, we found that bulk RNA-seq analysis methods do not

generally perform worse than those developed specifically for

scRNA-seq. We also present conquer, a repository of consistently

processed, analysis-ready public scRNA-seq data sets that is

aimed at simplifying method evaluation and reanalysis of

published results. Each data set provides abundance estimates

for both genes and transcripts, as well as quality control and

exploratory analysis reports.",

journal="Nature methods",

publisher="Nature Publishing Group, a division of Macmillan Publishers

Limited. All Rights Reserved.",

month=feb,

year=2018,

url="http://dx.doi.org/10.1038/nmeth.4612",

issn="1548-7091",

doi="10.1038/nmeth.4612"

}

@ARTICLE{Finak2015-ow,

title="{MAST: a flexible statistical framework for assessing

transcriptional changes and characterizing heterogeneity in

single-cell RNA sequencing data}",

author="Finak, Greg and McDavid, Andrew and Yajima, Masanao and Deng,

Jingyuan and Gersuk, Vivian and Shalek, Alex K and Slichter,

Chloe K and Miller, Hannah W and McElrath, M Juliana and Prlic,

Martin and Linsley, Peter S and Gottardo, Raphael",

@@ -46,7 +46,7 @@ that it is typically much easier to visualize the data in a 2 or

* Scalability: in the last few years the number of cells in scRNA-seq experiments has grown by several orders of magnitude from ~$10^2$ to ~$10^6$

### unsupervised Clustering methods

### Unsupervised clustering methods

Three main ingredients of a complete clustering method:

...

...

@@ -108,7 +108,7 @@ to scRNA-seq data by building a graph where each vertice represents a cell

and (weight of) the edge measures similarity between two cells.

Actually, graph-based clustering is the most popular clustering algorithm in

scRNA-seq data analysis, and has been reported to have outperformed other

clustering methods in many situations (ref).

clustering methods in many situations [@freytag2018comparison].

##### Why do we want to represent the data as a graph?\

...

...

@@ -123,12 +123,12 @@ clustering methods in many situations (ref).

- __Step2__: Add weights, and obtain a shared nearest neighbour (__SNN__) graph

<center>![](figures/SNN.jpg){width=4%}</center>

<center>![](figures/SNN.jpg){width=40%}</center>

There are two ways of adding weights: number and rank.\

- _number_: The number of shared nodes between $u$ and $v$, in this case, 3. \

- _rank_: A measurement of the closeness to their common nearest neighbours. (ref) \

- _rank_: A measurement of the closeness to their common nearest neighbours. (@xu2015identification) \

<font color="#bf812d">

...

...

@@ -145,28 +145,37 @@ $$ w(u, v) = K - s(u, v).$$

##### Quality function (Modularity)\

Modularity is not the only quality function for graph-based clustering,

Modularity [@newman2004finding] is not the only quality function for graph-based clustering,

but it is one of the first attempts to embed in a compact form many questions including

<font color="red"> ... </font>.\

the definition of quality function and null model etc.\

__The idea of modularity__: A random graph should not have a cluster structure. \

The more "quality" a partition has compared to a random graph, the "better" the partition is.\

Specifically, it is defined by:

the <font color="#bf812d"> quality </font> of a partition on the actual graph $-$ the quality of the same partition on a <font color="#bf812d"> random graph </font>

<font color="#bf812d"> quality </font>: Sum of the weights within clusters \

<font color="#bf812d"> random graph </font>: a copy of the original graph, with some of its properties, but without community structure. The random graph defined by modularity is: each node has the same degree as the original graph.

Short version: Modularity maximization forces small communities into larger ones. \

Longer version: For two clusters $A$ and $B$, if $k_A k_B < 2m$ then modularity increases by merging A and B into a single cluster, even if A and B are distinct clusters.\

...

...

@@ -182,12 +191,13 @@ __Limits of modularity__: \

Modularity-based clustering methods implemented in single cell analysis are mostly greedy algorithms,

that are very fast, although not the most accurate approaches.

Reaches very high similarity with the labels provided in the original paper.

However, it tend to merge small clusters into larger ones.

```{r}

table(deng$cell_type1, cl)

table(deng$cell_type2, cl)

```

...

...

@@ -94,19 +94,15 @@ table(muraro$cell_type1, cl)

Let's run `SC3` clustering on the Deng data. The advantage of the `SC3` is that it can directly ingest a `SingleCellExperiment` object.

Now let's image we do not know the number of clusters _k_ (cell types). `SC3` can estimate a number of clusters for you:

```{r, eval= F}

`SC3` can estimate a number of clusters:

```{r}

deng <- sc3_estimate_k(deng)

metadata(deng)$sc3$k_estimation

```

Interestingly, the number of cell types predicted by `SC3` is smaller than in the original data annotation. However, early, mid and late stages of different cell types together, we will have exactly 6 cell types. We store the merged cell types in `cell_type1` column of the `colData` slot:

```{r, eval= F}

plotPCA(deng, colour_by = "cell_type1")

```

Now we are ready to run `SC3` (we also ask it to calculate biological properties of the clusters):

```{r, eval= F}

Next we run `SC3` (we also ask it to calculate biological properties of the clusters):

We stop here and assign each cell with label that score the highest, actually, if we set the argument ```fine.tune = FALSE```, that is exactly what the package function ```SingleR``` does.

But there is one more question, what if the second highest score is very close to the highest?

But there is one more question, what if the second highest score is very close to the highest? say, 1, 1, 1, 9.5, 10.

`SingleR` set a threshold to define how close is "very close", the default is 0.05.

For (only) the cells that falls into this category, it goes back to Step2.

#### Example

...

...

@@ -314,4 +312,4 @@ plot(

sessionInfo()

```

Among the 2126 cells in the data, only 89 are annotated as different labels as the

available where the RNA-seq data has been validated using

RT-qPCR. These data can be used to benchmark DE finding algorithms and the available evidence suggests that the algorithms are performing quite well.

available where the RNA-seq data has been validated using RT-qPCR. These data

can be used to benchmark DE finding algorithms and the available evidence

suggests that the algorithms are performing well.

### Single cell RNA-seq

In contrast to bulk RNA-seq, in scRNA-seq we usually do not have a defined

set of experimental conditions. Instead, as was shown in a previous chapter

In contrast to bulk RNA-seq, in scRNA-seq we often do not have a defined set

of experimental conditions. Instead, as was shown in a previous chapter

(\@ref(clust-methods)) we can identify the cell groups by using an unsupervised

clustering approach. Once the groups have been identified one can find differentially

expressed genes either by comparing the differences in variance between the groups (like the Kruskal-Wallis test implemented in SC3), or by comparing gene expression between clusters in a pairwise manner. In the following chapter we will mainly consider tools developed for pairwise comparisons.

clustering approach. Once the groups have been identified one can find

differentially expressed genes either by comparing the differences in variance

between the groups (like the Kruskal-Wallis test implemented in SC3), or by

comparing gene expression between clusters in a pairwise manner. In the

following chapter we will mainly consider tools developed for pairwise

comparisons.

These method may also be applied when comparing cells obtained from different

groups or conditions. Such analyses can be complicated by differing cell type

proportions between samples (i.e. distinct samples cell populations; the unit of

replication in the study). In such cases, it is likely beneficial to identify

distinct cell types and conduct differential expression testing between

conditions within each cell type.

### Differences in Distribution

Unlike bulk RNA-seq, we generally have a large number of samples (i.e. cells) for each group we are comparing in single-cell experiments. Thus we can take advantage of the whole distribution of expression values in each group to identify differences between groups rather than only comparing estimates of mean-expression as is standard for bulk RNASeq.

Unlike bulk RNA-seq, we generally have a large number of samples (i.e. cells)

for each group we are comparing in single-cell experiments. Thus we may be able

to take advantage of the whole distribution of expression values in each group

to identify differences between groups rather than only comparing estimates of

mean-expression as is standard for bulk RNASeq.

There are two main approaches to comparing distributions. Firstly, we can use

existing statistical models/distributions and fit the same type of model to the

expression in each group then test for differences in the parameters for each

model, or test whether the model fits better if a particular parameter is allowed

to be different according to group. For instance in Chapter

\@ref(dealing-with-confounders) we used `edgeR` to test whether allowing mean

expression to be different in different batches significantly improved the fit

of a negative binomial model of the data.

Alternatively, we can use a non-parametric test which does not assume that

expression values follow any particular distribution, e.g. the

Non-parametric tests generally convert observed expression values to ranks and

test whether the distribution of ranks for one group are signficantly different

from the distribution of ranks for the other group. However, some non-parametric

methods fail in the presence of a large number of tied values, such as the case

for dropouts (zeros) in single-cell RNA-seq expression data. Moreover, if the

conditions for a parametric test hold, then it will typically be more powerful

than a non-parametric test.

### Benchmarking of DE methods for scRNA-seq data

So far there has been one high-quality benchmarking study of single-cell

differential expression methods [@Soneson2018-hy]. The figure below summarises

the results from that paper (which is well worth reading in full!):

```{r de-benchmarking, out.width='90%', fig.cap="Figure 5 reproduced from Soneson and Robinson (2018). Summary of DE method performance across all major evaluation criteria. Criteria and cutoff values for performance categories are available in the Online Methods. Methods are ranked by their average performance across the criteria, with the numerical encoding good = 2, intermediate = 1, poor = 0. NODES and SAMseq do not return nominal P values and were therefore not evaluated in terms of the FPR."}

There are two main approaches to comparing distributions. Firstly, we can use existing statistical models/distributions and fit the same type of model to the expression in each group then test for differences in the parameters for each model, or test whether the model fits better if a particular paramter is allowed to be different according to group. For instance in Chapter \@ref(dealing-with-confounders) we used edgeR to test whether allowing mean expression to be different in different batches significantly improved the fit of a negative binomial model of the data.

One particularly surprising outcome of this benchmarking study is that almost

all methods designed specifically for the analysis of scRNA-seq data are

outperformed by established bulk RNA-seq DE methods (edgeR, limma) and standard,

[@Finak2015-ow] is the only method designed specifically for scRNA-seq data that

performs well in this benchmark. These benchmarking results are a credit to the

durability and flexibility of the leading bulk RNA-seq DE methods and a subtle

indictment of the land rush of new scRNA-seq methods that were published without

adequate comparison to existing bulk RNA-seq methods.

Alternatively, we can use a non-parametric test which does not assume that expression values follow any particular distribution, e.g. the [Kolmogorov-Smirnov test (KS-test)](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test). Non-parametric tests generally convert observed expression values to ranks and test whether the distribution of ranks for one group are signficantly different from the distribution of ranks for the other group. However, some non-parametric methods fail in the presence of a large number of tied values, such as the case for dropouts (zeros) in single-cell RNA-seq expression data. Moreover, if the conditions for a parametric test hold, then it will typically be more powerful than a non-parametric test.

### Models of single-cell RNASeq data

### Models of single-cell RNA-seq data

The most common model of RNASeq data is the negative binomial model:

...

...

@@ -67,9 +124,21 @@ $\mu = mu$

Variance:

$\sigma^2 = mu + mu^2/size$

It is parameterized by the mean expression (mu) and the dispersion (size), which is inversely related to the variance. The negative binomial model fits bulk RNA-seq data very well and it is used for most statistical methods designed for such data. In addition, it has been show to fit the distribution of molecule counts obtained from data tagged by unique molecular identifiers (UMIs) quite well ([Grun et al. 2014](http://www.nature.com/nmeth/journal/v11/n6/full/nmeth.2930.html), [Islam et al. 2011](http://genome.cshlp.org/content/21/7/1160)).

However, a raw negative binomial model does not fit full-length transcript data as well due to the high dropout rates relative to the non-zero read counts. For this type of data a variety of zero-inflated negative binomial models have been proposed (e.g. [MAST](https://bioconductor.org/packages/release/bioc/html/MAST.html), [SCDE](https://bioconductor.org/packages/release/bioc/html/scde.html)).

It is parameterized by the mean expression (mu) and the dispersion (size), which

is inversely related to the variance. The negative binomial model fits bulk

RNA-seq data very well and it is used for most statistical methods designed for

such data. In addition, it has been show to fit the distribution of molecule

counts obtained from data tagged by unique molecular identifiers (UMIs) quite

These models introduce a new parameter $d$, for the dropout rate, to the negative binomial model. As we saw in Chapter 19, the dropout rate of a gene is strongly correlated with the mean expression of the gene. Different zero-inflated negative binomial models use different relationships between mu and d and some may fit $\mu$ and $d$ to the expression of each gene independently.

Finally, several methods use a Poisson-Beta distribution which is based on a mechanistic model of transcriptional bursting. There is strong experimental support for this model ([Kim and Marioni, 2013](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-1-r7)) and it provides a good fit to scRNA-seq data but it is less easy to use than the negative-binomial models and much less existing methods upon which to build than the negative binomial model.

These models introduce a new parameter $d$, for the dropout rate, to the

negative binomial model. As we saw in Chapter 19, the dropout rate of a gene is

strongly correlated with the mean expression of the gene. Different

zero-inflated negative binomial models use different relationships between mu

and d and some may fit $\mu$ and $d$ to the expression of each gene

independently.

Finally, several methods use a Poisson-Beta distribution which is based on a

mechanistic model of transcriptional bursting. There is strong experimental

$\sigma^2 = g^2 \cdot a \cdot b/((a + b + 1) \cdot (a + b)^2)$

This model uses three parameters: $a$ the rate of activation of transcription; $b$ the rate of inhibition of transcription; and $g$ the rate of transcript production while transcription is active at the locus. Differential expression methods may test each of the parameters for differences across groups or only one (often $g$).

This model uses three parameters: $a$ the rate of activation of transcription;

$b$ the rate of inhibition of transcription; and $g$ the rate of transcript

production while transcription is active at the locus. Differential expression

methods may test each of the parameters for differences across groups or only

one (often $g$).

All of these models may be further expanded to explicitly account for other sources of gene expression differences such as batch-effect or library depth depending on the particular DE algorithm.

All of these models may be further expanded to explicitly account for other

sources of gene expression differences such as batch-effect or library depth

depending on the particular DE algorithm.

__Exercise__: Vary the parameters of each distribution to explore how they affect the distribution of gene expression. How similar are the Poisson-Beta and Negative Binomial models?

__Exercise__: Vary the parameters of each distribution to explore how they

affect the distribution of gene expression. How similar are the Poisson-Beta and