From 44fe16997fba871feb16f0828d4b5e44d723bf7a Mon Sep 17 00:00:00 2001 From: Puxue Qiao <pqiao@svi.edu.au> Date: Tue, 1 Oct 2019 19:08:25 +1000 Subject: [PATCH] add references and minor adjustments --- course_files/book.bib | 134 +++++++++++++++++++++++++++++++++ course_files/clust-intro.Rmd | 32 +++++--- course_files/clustering.Rmd | 32 ++++---- course_files/latent-spaces.Rmd | 16 ++-- 4 files changed, 179 insertions(+), 35 deletions(-) diff --git a/course_files/book.bib b/course_files/book.bib index 0af2137..5244f1a 100644 --- a/course_files/book.bib +++ b/course_files/book.bib @@ -1127,3 +1127,137 @@ doi = {10.18637/jss.v059.i10} language = "en", doi = "10.1101/574574" } + + +@article{xu2015identification, + title={Identification of cell types from single-cell transcriptomes using a novel clustering method}, + author={Xu, Chen and Su, Zhengchang}, + journal={Bioinformatics}, + volume={31}, + number={12}, + pages={1974--1980}, + year={2015}, + publisher={Oxford University Press} +} + + +@article{newman2004finding, + title={Finding and evaluating community structure in networks}, + author={Newman, Mark EJ and Girvan, Michelle}, + journal={Physical review E}, + volume={69}, + number={2}, + pages={026113}, + year={2004}, + publisher={APS} +} + +@article{blondel2008fast, + title={Fast unfolding of communities in large networks}, + author={Blondel, Vincent D and Guillaume, Jean-Loup and Lambiotte, Renaud and Lefebvre, Etienne}, + journal={Journal of statistical mechanics: theory and experiment}, + volume={2008}, + number={10}, + pages={P10008}, + year={2008}, + publisher={IOP Publishing} +} + +@article{traag2019louvain, + title={From Louvain to Leiden: guaranteeing well-connected communities}, + author={Traag, Vincent A and Waltman, Ludo and van Eck, Nees Jan}, + journal={Scientific reports}, + volume={9}, + year={2019}, + publisher={Nature Publishing Group} +} + +@article{good2010performance, + title={Performance of modularity maximization in practical contexts}, + 
author={Good, Benjamin H and de Montjoye, Yves-Alexandre and Clauset, Aaron}, + journal={Physical Review E}, + volume={81}, + number={4}, + pages={046106}, + year={2010}, + publisher={APS} +} + +@article{freytag2018comparison, + title={Comparison of clustering tools in {R} for medium-sized {10x Genomics} single-cell {RNA}-sequencing data}, + author={Freytag, Saskia and Tian, Luyi and L{\"o}nnstedt, Ingrid and Ng, Milica and Bahlo, Melanie}, + journal={F1000Research}, + volume={7}, + year={2018}, + publisher={Faculty of 1000 Ltd} +} + +@inproceedings{collins2002generalization, + title={A generalization of principal components analysis to the exponential family}, + author={Collins, Michael and Dasgupta, Sanjoy and Schapire, Robert E}, + booktitle={Advances in Neural Information Processing Systems}, + pages={617--624}, + year={2002} +} + +@inproceedings{hinton2003stochastic, + title={Stochastic neighbor embedding}, + author={Hinton, Geoffrey E and Roweis, Sam T}, + booktitle={Advances in Neural Information Processing Systems}, + pages={857--864}, + year={2003} +} + +@article{maaten2008visualizing, + title={Visualizing data using {t-SNE}}, + author={van der Maaten, Laurens and Hinton, Geoffrey}, + journal={Journal of Machine Learning Research}, + volume={9}, + number={Nov}, + pages={2579--2605}, + year={2008} +} + +@article{moon2017phate, + title={{PHATE}: a dimensionality reduction method for visualizing trajectory structures in high-dimensional biological data}, + author={Moon, Kevin R and van Dijk, David and Wang, Zheng and Chen, William and Hirn, Matthew J and Coifman, Ronald R and Ivanova, Natalia B and Wolf, Guy and Krishnaswamy, Smita}, + journal={bioRxiv}, + pages={120378}, + year={2017}, + publisher={Cold Spring Harbor Laboratory} +} + +@article{buettner2017f, + title={{f-scLVM}: scalable and versatile factor analysis for single-cell {RNA}-seq}, + author={Buettner, Florian and Pratanwanich, Naruemon and McCarthy, Davis J and Marioni, John C and Stegle, Oliver}, + 
journal={Genome biology}, + volume={18}, + number={1}, + pages={212}, + year={2017}, + publisher={BioMed Central} +} + +@article{kingma2013auto, + title={Auto-encoding variational bayes}, + author={Kingma, Diederik P and Welling, Max}, + journal={arXiv preprint arXiv:1312.6114}, + year={2013} +} + + +@article{mcinnes2018umap, + title={Umap: Uniform manifold approximation and projection for dimension reduction}, + author={McInnes, Leland and Healy, John and Melville, James}, + journal={arXiv preprint arXiv:1802.03426}, + year={2018} +} + +@article{townes2019feature, + title={Feature Selection and Dimension Reduction for Single Cell RNA-Seq based on a Multinomial Model}, + author={Townes, F William and Hicks, Stephanie C and Aryee, Martin J and Irizarry, Rafael A}, + journal={bioRxiv}, + pages={574574}, + year={2019}, + publisher={Cold Spring Harbor Laboratory} +} \ No newline at end of file diff --git a/course_files/clust-intro.Rmd b/course_files/clust-intro.Rmd index 47b126b..f91ed43 100644 --- a/course_files/clust-intro.Rmd +++ b/course_files/clust-intro.Rmd @@ -108,7 +108,7 @@ to scRNA-seq data by building a graph where each vertice represents a cell and (weight of) the edge measures similarity between two cells. Actually, graph-based clustering is the most popular clustering algorithm in scRNA-seq data analysis, and has been reported to have outperformed other -clustering methods in many situations (ref). +clustering methods in many situations [@freytag2018comparison]. ##### Why do we want to represent the data as a graph?\ @@ -123,12 +123,12 @@ clustering methods in many situations (ref). - __Step2__: Add weights, and obtain a shared nearest neighbour (__SNN__) graph -<center>{width= 4%}</center> +<center>{width=40%}</center> There are two ways of adding weights: number and rank.\ - _number_: The number of shared nodes between $u$ and $v$, in this case, 3. \ -- _rank_: A measurement of the closeness to their common nearest neighbours. 
(ref) \ +- _rank_: A measurement of the closeness to their common nearest neighbours. (@xu2015identification) \ <font color="#bf812d"> @@ -145,28 +145,37 @@ $$ w(u, v) = K - s(u, v).$$ ##### Quality function (Modularity)\ -Modularity is not the only quality function for graph-based clustering, +Modularity [@newman2004finding] is not the only quality function for graph-based clustering, but it is one of the first attempts to embed in a compact form many questions including -<font color="red"> ... </font>.\ +the definition of quality function and null model etc.\ __The idea of modularity__: A random graph should not have a cluster structure. \ The more "quality" a partition has compared to a random graph, the "better" the partition is.\ Specifically, it is defined by: the <font color="#bf812d"> quality </font> of a partition on the actual graph $-$ the quality of the same partition on a <font color="#bf812d"> random graph </font> - + <font color="#bf812d"> quality </font>: Sum of the weights within clusters \ <font color="#bf812d"> random graph </font>: a copy of the original graph, with some of its properties, but without community structure. The random graph defined by modularity is: each node has the same degree as the original graph. - $$ Q \propto \sum_{i, j} A_{i, j} \delta(i, j) - \sum_{i, j} \dfrac{k_i k_j}{2m} \delta(i, j)$$ -<font color="red"> [notations] </font> + $$ Q \propto \sum_{i, j} A_{i, j} \delta(i, j) - \sum_{i, j} \dfrac{k_i k_j}{2m} \delta(i, j)$$ + +- $A_{i, j}$: weight between node $i$ and $j$; + +- $\delta(i, j)$: indicator of whether $i$ and $j$ are in the same cluster; + +- $k_i$: the degree of node $i$ (the sum of weights of all edges connected to $i$); + +- $m$: the total weight of the whole graph. + + __Higher modularity implies better partition__: <center>{width=80%}</center> -__Limits of modularity__: [@good2010performance]\ 1. Resolution limit. 
\ Short version: Modularity maximization forces small communities into larger ones. \ Longer version: For two clusters $A$ and $B$, if $k_A k_B < 2m$ then modularity increases by merging A and B into a single cluster, even if A and B are distinct clusters.\ @@ -182,12 +191,13 @@ __Limits of modularity__: \ Modularity-based clustering methods implemented in single cell analysis are mostly greedy algorithms, that are very fast, although not the most accurate approaches. - __Louvain__: + __Louvain__: [@blondel2008fast] <center>{width=80%}</center> - __Leiden__: Improved Louvain, hybrid of greedy algorithm and sampling technique \ + __Leiden__:[@traag2019louvain] \ + Improved Louvain, hybrid of greedy algorithm and sampling technique \ ##### __Advantages__: \ -Fast \ diff --git a/course_files/clustering.Rmd b/course_files/clustering.Rmd index 48d434c..89cc2a7 100644 --- a/course_files/clustering.Rmd +++ b/course_files/clustering.Rmd @@ -59,14 +59,14 @@ Perform Louvain clustering: ```{r clustering} cl <- igraph::cluster_louvain(deng15)$membership colData(deng)$cl <- factor(cl) -mclust::adjustedRandIndex(colData(deng)$cell_type1, colData(deng)$cl) +mclust::adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$cl) ``` Reaches very high similarity with the labels provided in the original paper. However, it tend to merge small clusters into larger ones. ```{r} -table(deng$cell_type1, cl) +table(deng$cell_type2, cl) ``` @@ -94,19 +94,15 @@ table(muraro$cell_type1, cl) Let's run `SC3` clustering on the Deng data. The advantage of the `SC3` is that it can directly ingest a `SingleCellExperiment` object. -Now let's image we do not know the number of clusters _k_ (cell types). `SC3` can estimate a number of clusters for you: -```{r, eval= F} +`SC3` can estimate a number of clusters: +```{r} deng <- sc3_estimate_k(deng) metadata(deng)$sc3$k_estimation ``` -Interestingly, the number of cell types predicted by `SC3` is smaller than in the original data annotation. 
However, early, mid and late stages of different cell types together, we will have exactly 6 cell types. We store the merged cell types in `cell_type1` column of the `colData` slot: -```{r, eval= F} -plotPCA(deng, colour_by = "cell_type1") -``` -Now we are ready to run `SC3` (we also ask it to calculate biological properties of the clusters): -```{r, eval= F} +Next we run `SC3` (we also ask it to calculate biological properties of the clusters): +```{r} deng <- sc3(deng, ks = 10, biology = TRUE, n_cores = 1) ``` @@ -118,27 +114,27 @@ sc3_plot_consensus(deng, k = 10, show_pdata = "cell_type2") ``` Silhouette plot: -```{r, fig.height=9, eval= F} +```{r, fig.height=9} sc3_plot_silhouette(deng, k = 10) ``` Heatmap of the expression matrix: -```{r, fig.height=6, eval= F} +```{r, fig.height=6} sc3_plot_expression(deng, k = 10, show_pdata = "cell_type2") ``` Identified marker genes: -```{r, fig.height=11, eval= F} +```{r, fig.height=11} sc3_plot_markers(deng, k = 10, show_pdata = "cell_type2") ``` PCA plot with highlighted `SC3` clusters: -```{r, eval= F} +```{r} plotPCA(deng, colour_by = "sc3_10_clusters") ``` Compare the results of `SC3` clustering with the original publication cell type labels: -```{r, eval= F} +```{r} adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$sc3_10_clusters) ``` @@ -176,7 +172,9 @@ __Note__ Due to direct calculation of distances `SC3` becomes very slow when the <center> {width=80%} </center> - __Step4. Fine tuning__\ We stop here and assign each cell with label that score the highest, actually, if we set the argument ```fine.tune = FALSE```, that is exactly what the package function ```SingleR``` does. - But there is one more question, what if the second highest score is very close to the highest? + But there is one more question, what if the second highest score is very close to the highest? say, 1, 1, 1, 9.5, 10. + `SingleR` set a threshold to define how close is "very close", the default is 0.05. 
+ For (only) the cells that fall into this category, it goes back to Step 2. #### Example @@ -314,4 +312,4 @@ plot( sessionInfo() ``` -Among the 2126 cells in the data, only 89 are annotated as different labels as the + diff --git a/course_files/latent-spaces.Rmd b/course_files/latent-spaces.Rmd index ec109be..922221b 100644 --- a/course_files/latent-spaces.Rmd +++ b/course_files/latent-spaces.Rmd @@ -107,7 +107,9 @@ plotPCA(deng, colour_by = "cell_type2") + non-linear dependencies. For instance, PCA would not be able to “unroll” the following structure.\ <center> {width=30%} </center> -#### GLM-PCA +#### [GLM-PCA](https://rdrr.io/cran/glmpca/) +[@collins2002generalization] +[@townes2019feature] GLM-PCA is a generalized version of the traditional PCA. @@ -217,8 +219,7 @@ ggplot(pd, aes(x=dim1, y=dim2, shape=clust, colour=batch)) + ### tSNE: t-Distributed Stochastic Neighbor Embedding -t-SNE is an advanced version of the original SNE algorithm. <font color="red"> -[ref] </font> +t-SNE [@maaten2008visualizing] is an advanced version of the original SNE algorithm. 
[@hinton2003stochastic] #### Motivation @@ -321,7 +322,7 @@ Therefore can merely be used for visualization.\ ### Manifold methods -#### UMAP: Uniform Manifold Approximation and Projection +#### UMAP: Uniform Manifold Approximation and Projection [@mcinnes2018umap] ##### __Advantages of UMAP over t-SNE:__ @@ -368,7 +369,7 @@ plotUMAP(muraro, colour_by="cell_type1") -#### PHATE +#### PHATE [@moon2017phate] ##### Sketch of algorithm @@ -450,7 +451,7 @@ The __factor loadings__ or weights indicate how much each latent factor is affec ### [Slalom](https://bioconductor.org/packages/release/bioc/html/slalom.html): Interpretable latent spaces -Highlight of Slalom: +Highlight of Slalom: [@buettner2017f] - It incorporates prior information to help the model estimation; @@ -524,7 +525,8 @@ The `plotTerms` function shows the relevance of all terms in the model, enabling plotTerms(model_deng) ``` -## Autoencoders +## Autoencoders +[@kingma2013auto] <center>{width=80%}</center> -- GitLab