@@ -1127,3 +1127,137 @@ doi = {10.18637/jss.v059.i10}

language="en",

doi="10.1101/574574"

}

@article{xu2015identification,
  title     = {Identification of cell types from single-cell transcriptomes using a novel clustering method},
  author    = {Xu, Chen and Su, Zhengchang},
  journal   = {Bioinformatics},
  volume    = {31},
  number    = {12},
  pages     = {1974--1980},
  year      = {2015},
  publisher = {Oxford University Press},
}

@article{newman2004finding,
  title     = {Finding and evaluating community structure in networks},
  author    = {Newman, Mark E. J. and Girvan, Michelle},
  journal   = {Physical Review E},
  volume    = {69},
  number    = {2},
  pages     = {026113},
  year      = {2004},
  publisher = {APS},
}

@article{blondel2008fast,
  title     = {Fast unfolding of communities in large networks},
  author    = {Blondel, Vincent D and Guillaume, Jean-Loup and Lambiotte, Renaud and Lefebvre, Etienne},
  journal   = {Journal of Statistical Mechanics: Theory and Experiment},
  volume    = {2008},
  number    = {10},
  pages     = {P10008},
  year      = {2008},
  publisher = {IOP Publishing},
}

@article{traag2019louvain,
  title     = {From {Louvain} to {Leiden}: guaranteeing well-connected communities},
  author    = {Traag, Vincent A and Waltman, Ludo and van Eck, Nees Jan},
  journal   = {Scientific Reports},
  volume    = {9},
  pages     = {5233},
  year      = {2019},
  publisher = {Nature Publishing Group},
}

@article{good2010performance,
  title     = {Performance of modularity maximization in practical contexts},
  author    = {Good, Benjamin H and De Montjoye, Yves-Alexandre and Clauset, Aaron},
  journal   = {Physical Review E},
  volume    = {81},
  number    = {4},
  pages     = {046106},
  year      = {2010},
  publisher = {APS},
}

@article{freytag2018comparison,
  title     = {Comparison of clustering tools in {R} for medium-sized {10x Genomics} single-cell {RNA}-sequencing data},
  author    = {Freytag, Saskia and Tian, Luyi and L{\"o}nnstedt, Ingrid and Ng, Milica and Bahlo, Melanie},
  journal   = {F1000Research},
  volume    = {7},
  year      = {2018},
  publisher = {Faculty of 1000 Ltd},
}

@inproceedings{collins2002generalization,
  title     = {A generalization of principal components analysis to the exponential family},
  author    = {Collins, Michael and Dasgupta, Sanjoy and Schapire, Robert E},
  booktitle = {Advances in neural information processing systems},
  pages     = {617--624},
  year      = {2002},
}

@inproceedings{hinton2003stochastic,
  title     = {Stochastic neighbor embedding},
  author    = {Hinton, Geoffrey E and Roweis, Sam T},
  booktitle = {Advances in neural information processing systems},
  pages     = {857--864},
  year      = {2003},
}

@article{maaten2008visualizing,
  title   = {Visualizing data using {t-SNE}},
  author  = {van der Maaten, Laurens and Hinton, Geoffrey},
  journal = {Journal of Machine Learning Research},
  volume  = {9},
  number  = {Nov},
  pages   = {2579--2605},
  year    = {2008},
}

@article{moon2017phate,
  title     = {{PHATE}: a dimensionality reduction method for visualizing trajectory structures in high-dimensional biological data},
  author    = {Moon, Kevin R and van Dijk, David and Wang, Zheng and Chen, William and Hirn, Matthew J and Coifman, Ronald R and Ivanova, Natalia B and Wolf, Guy and Krishnaswamy, Smita},
  journal   = {bioRxiv},
  pages     = {120378},
  year      = {2017},
  publisher = {Cold Spring Harbor Laboratory},
}

@article{buettner2017f,
  title     = {{f-scLVM}: scalable and versatile factor analysis for single-cell {RNA-seq}},
  author    = {Buettner, Florian and Pratanwanich, Naruemon and McCarthy, Davis J and Marioni, John C and Stegle, Oliver},
  journal   = {Genome Biology},
  volume    = {18},
  number    = {1},
  pages     = {212},
  year      = {2017},
  publisher = {BioMed Central},
}

@article{kingma2013auto,
  title   = {Auto-encoding variational {Bayes}},
  author  = {Kingma, Diederik P and Welling, Max},
  journal = {arXiv preprint arXiv:1312.6114},
  year    = {2013},
}

@article{mcinnes2018umap,
  title   = {{UMAP}: Uniform manifold approximation and projection for dimension reduction},
  author  = {McInnes, Leland and Healy, John and Melville, James},
  journal = {arXiv preprint arXiv:1802.03426},
  year    = {2018},
}

@article{townes2019feature,

title={Feature Selection and Dimension Reduction for Single Cell RNA-Seq based on a Multinomial Model},

author={Townes, F William and Hicks, Stephanie C and Aryee, Martin J and Irizarry, Rafael A},

@@ -108,7 +108,7 @@ to scRNA-seq data by building a graph where each vertex represents a cell

and (weight of) the edge measures similarity between two cells.

Actually, graph-based clustering is the most popular clustering algorithm in

scRNA-seq data analysis, and has been reported to have outperformed other

clustering methods in many situations (ref).

clustering methods in many situations [@freytag2018comparison].

##### Why do we want to represent the data as a graph?\

...

...

@@ -123,12 +123,12 @@ clustering methods in many situations (ref).

- __Step2__: Add weights, and obtain a shared nearest neighbour (__SNN__) graph

<center>![](figures/SNN.jpg){width=4%}</center>

<center>![](figures/SNN.jpg){width=40%}</center>

There are two ways of adding weights: number and rank.\

- _number_: The number of shared nodes between $u$ and $v$, in this case, 3. \

- _rank_: A measurement of the closeness to their common nearest neighbours. (ref) \

- _rank_: A measurement of the closeness to their common nearest neighbours. (@xu2015identification) \

<font color="#bf812d">

...

...

@@ -145,28 +145,37 @@ $$ w(u, v) = K - s(u, v).$$

##### Quality function (Modularity)\

Modularity is not the only quality function for graph-based clustering,

Modularity [@newman2004finding] is not the only quality function for graph-based clustering,

but it is one of the first attempts to embed in a compact form many questions including

<font color="red"> ... </font>.\

the definition of quality function and null model etc.\

__The idea of modularity__: A random graph should not have a cluster structure. \

The more "quality" a partition has compared to a random graph, the "better" the partition is.\

Specifically, it is defined by:

the <font color="#bf812d"> quality </font> of a partition on the actual graph $-$ the quality of the same partition on a <font color="#bf812d"> random graph </font>

<font color="#bf812d"> quality </font>: Sum of the weights within clusters \

<font color="#bf812d"> random graph </font>: a copy of the original graph, with some of its properties, but without community structure. The random graph defined by modularity is: each node has the same degree as the original graph.

Short version: Modularity maximization forces small communities into larger ones. \

Longer version: For two clusters $A$ and $B$, if $k_A k_B < 2m$ then modularity increases by merging $A$ and $B$ into a single cluster, even if $A$ and $B$ are distinct clusters.\

...

...

@@ -182,12 +191,13 @@ __Limits of modularity__: \

Modularity-based clustering methods implemented in single cell analysis are mostly greedy algorithms,

that are very fast, although not the most accurate approaches.

It reaches very high similarity with the labels provided in the original paper.

However, it tends to merge small clusters into larger ones.

```{r}

table(deng$cell_type1, cl)

table(deng$cell_type2, cl)

```

...

...

@@ -94,19 +94,15 @@ table(muraro$cell_type1, cl)

Let's run `SC3` clustering on the Deng data. The advantage of the `SC3` is that it can directly ingest a `SingleCellExperiment` object.

Now let's imagine we do not know the number of clusters _k_ (cell types). `SC3` can estimate a number of clusters for you:

```{r, eval= F}

`SC3` can estimate a number of clusters:

```{r}

deng <- sc3_estimate_k(deng)

metadata(deng)$sc3$k_estimation

```

Interestingly, the number of cell types predicted by `SC3` is smaller than in the original data annotation. However, if we merge the early, mid and late stages of the different cell types together, we will have exactly 6 cell types. We store the merged cell types in `cell_type1` column of the `colData` slot:

```{r, eval= F}

plotPCA(deng, colour_by = "cell_type1")

```

Now we are ready to run `SC3` (we also ask it to calculate biological properties of the clusters):

```{r, eval= F}

Next we run `SC3` (we also ask it to calculate biological properties of the clusters):

We stop here and assign each cell the label that scores the highest; in fact, if we set the argument ```fine.tune = FALSE```, that is exactly what the package function ```SingleR``` does.

But there is one more question, what if the second highest score is very close to the highest?

But there is one more question, what if the second highest score is very close to the highest? say, 1, 1, 1, 9.5, 10.

`SingleR` sets a threshold to define how close counts as "very close"; the default is 0.05.

For (only) the cells that fall into this category, it goes back to Step2.

#### Example

...

...

@@ -314,4 +312,4 @@ plot(

sessionInfo()

```

Among the 2126 cells in the data, only 89 are annotated with different labels than the