From 44fe16997fba871feb16f0828d4b5e44d723bf7a Mon Sep 17 00:00:00 2001
From: Puxue Qiao <pqiao@svi.edu.au>
Date: Tue, 1 Oct 2019 19:08:25 +1000
Subject: [PATCH] add references and minor adjustments

---
 course_files/book.bib          | 134 +++++++++++++++++++++++++++++++++
 course_files/clust-intro.Rmd   |  32 +++++---
 course_files/clustering.Rmd    |  32 ++++----
 course_files/latent-spaces.Rmd |  16 ++--
 4 files changed, 179 insertions(+), 35 deletions(-)

diff --git a/course_files/book.bib b/course_files/book.bib
index 0af2137..5244f1a 100644
--- a/course_files/book.bib
+++ b/course_files/book.bib
@@ -1127,3 +1127,137 @@ doi = {10.18637/jss.v059.i10}
   language = "en",
   doi      = "10.1101/574574"
 }
+
+
+@article{xu2015identification,
+  title={Identification of cell types from single-cell transcriptomes using a novel clustering method},
+  author={Xu, Chen and Su, Zhengchang},
+  journal={Bioinformatics},
+  volume={31},
+  number={12},
+  pages={1974--1980},
+  year={2015},
+  publisher={Oxford University Press}
+}
+
+
+@article{newman2004finding,
+  title={Finding and evaluating community structure in networks},
+  author={Newman, Mark EJ and Girvan, Michelle},
+  journal={Physical review E},
+  volume={69},
+  number={2},
+  pages={026113},
+  year={2004},
+  publisher={APS}
+}
+
+@article{blondel2008fast,
+  title={Fast unfolding of communities in large networks},
+  author={Blondel, Vincent D and Guillaume, Jean-Loup and Lambiotte, Renaud and Lefebvre, Etienne},
+  journal={Journal of statistical mechanics: theory and experiment},
+  volume={2008},
+  number={10},
+  pages={P10008},
+  year={2008},
+  publisher={IOP Publishing}
+}
+
+@article{traag2019louvain,
+  title={From {Louvain} to {Leiden}: guaranteeing well-connected communities},
+  author={Traag, Vincent A and Waltman, Ludo and van Eck, Nees Jan},
+  journal={Scientific reports},
+  volume={9},
+  year={2019},
+  publisher={Nature Publishing Group}
+}
+
+@article{good2010performance,
+  title={Performance of modularity maximization in practical contexts},
+  author={Good, Benjamin H and De Montjoye, Yves-Alexandre and Clauset, Aaron},
+  journal={Physical Review E},
+  volume={81},
+  number={4},
+  pages={046106},
+  year={2010},
+  publisher={APS}
+}
+
+@article{freytag2018comparison,
+  title={Comparison of clustering tools in R for medium-sized 10x Genomics single-cell RNA-sequencing data},
+  author={Freytag, Saskia and Tian, Luyi and L{\"o}nnstedt, Ingrid and Ng, Milica and Bahlo, Melanie},
+  journal={F1000Research},
+  volume={7},
+  year={2018},
+  publisher={Faculty of 1000 Ltd}
+}
+
+@inproceedings{collins2002generalization,
+  title={A generalization of principal components analysis to the exponential family},
+  author={Collins, Michael and Dasgupta, Sanjoy and Schapire, Robert E},
+  booktitle={Advances in neural information processing systems},
+  pages={617--624},
+  year={2002}
+}
+
+@inproceedings{hinton2003stochastic,
+  title={Stochastic neighbor embedding},
+  author={Hinton, Geoffrey E and Roweis, Sam T},
+  booktitle={Advances in neural information processing systems},
+  pages={857--864},
+  year={2003}
+}
+
+@article{maaten2008visualizing,
+  title={Visualizing data using {t-SNE}},
+  author={van der Maaten, Laurens and Hinton, Geoffrey},
+  journal={Journal of machine learning research},
+  volume={9},
+  number={Nov},
+  pages={2579--2605},
+  year={2008}
+}
+
+@article{moon2017phate,
+  title={{PHATE}: a dimensionality reduction method for visualizing trajectory structures in high-dimensional biological data},
+  author={Moon, Kevin R and van Dijk, David and Wang, Zheng and Chen, William and Hirn, Matthew J and Coifman, Ronald R and Ivanova, Natalia B and Wolf, Guy and Krishnaswamy, Smita},
+  journal={bioRxiv},
+  pages={120378},
+  year={2017},
+  publisher={Cold Spring Harbor Laboratory}
+}
+
+@article{buettner2017f,
+  title={{f-scLVM}: scalable and versatile factor analysis for single-cell {RNA}-seq},
+  author={Buettner, Florian and Pratanwanich, Naruemon and McCarthy, Davis J and Marioni, John C and Stegle, Oliver},
+  journal={Genome biology},
+  volume={18},
+  number={1},
+  pages={212},
+  year={2017},
+  publisher={BioMed Central}
+}
+
+@article{kingma2013auto,
+  title={Auto-encoding variational {Bayes}},
+  author={Kingma, Diederik P and Welling, Max},
+  journal={arXiv preprint arXiv:1312.6114},
+  year={2013}
+}
+
+
+@article{mcinnes2018umap,
+  title={{UMAP}: Uniform manifold approximation and projection for dimension reduction},
+  author={McInnes, Leland and Healy, John and Melville, James},
+  journal={arXiv preprint arXiv:1802.03426},
+  year={2018}
+}
+
+@article{townes2019feature,
+  title={Feature Selection and Dimension Reduction for Single Cell RNA-Seq based on a Multinomial Model},
+  author={Townes, F William and Hicks, Stephanie C and Aryee, Martin J and Irizarry, Rafael A},
+  journal={bioRxiv},
+  pages={574574},
+  year={2019},
+  publisher={Cold Spring Harbor Laboratory}
+}
\ No newline at end of file
diff --git a/course_files/clust-intro.Rmd b/course_files/clust-intro.Rmd
index 47b126b..f91ed43 100644
--- a/course_files/clust-intro.Rmd
+++ b/course_files/clust-intro.Rmd
@@ -108,7 +108,7 @@ to scRNA-seq data by building a graph where each vertice represents a cell
 and (weight of) the edge measures similarity between two cells.  
 Actually, graph-based clustering is the most popular clustering algorithm in 
 scRNA-seq data analysis, and has been reported to have outperformed other
-clustering methods in many situations (ref).
+clustering methods in many situations [@freytag2018comparison].
 
 
 ##### Why do we want to represent the data as a graph?\
@@ -123,12 +123,12 @@ clustering methods in many situations (ref).
 
 - __Step2__: Add weights, and obtain a shared nearest neighbour (__SNN__) graph
 
-<center>![](figures/SNN.jpg){width= 4%}</center>
+<center>![](figures/SNN.jpg){width=40%}</center>
 
 
 There are two ways of adding weights: number and rank.\
 - _number_: The number of shared nodes between $u$ and $v$, in this case, 3.    \
-- _rank_: A measurement of the closeness to their common nearest neighbours. (ref) \
+- _rank_: A measurement of the closeness to their common nearest neighbours. [@xu2015identification] \
 
 
  <font color="#bf812d">   
@@ -145,28 +145,37 @@ $$ w(u, v) = K -  s(u, v).$$
 
  
 #####  Quality function (Modularity)\
-Modularity is not the only quality function for graph-based clustering, 
+Modularity [@newman2004finding] is not the only quality function for graph-based clustering, 
 but it is one of the first attempts to embed in a compact form many questions including
-<font color="red"> ... </font>.\
+the definition of quality function and null model etc.\
 
 __The idea of modularity__: A random graph should not have a cluster structure. \
 The more "quality" a partition has compared to a random graph, the "better" the partition is.\
 Specifically, it is defined by:
 
  the  <font color="#bf812d">  quality </font> of a partition on the actual graph $-$ the quality of the same partition on a   <font color="#bf812d"> random graph </font>
-   
+  
   <font color="#bf812d"> quality </font>: Sum of the weights within clusters \
    <font color="#bf812d"> random graph </font>: a copy of the original graph, with some of its properties, but without community structure.   The random graph defined by modularity is: each node has the same degree as the original graph. 
      
-  $$ Q \propto \sum_{i, j} A_{i, j} \delta(i, j) - \sum_{i, j} \dfrac{k_i k_j}{2m} \delta(i, j)$$
-<font color="red"> [notations] </font>
+  $$ Q \propto \sum_{i, j} A_{i, j} \delta(i, j) - \sum_{i, j} \dfrac{k_i k_j}{2m} \delta(i, j)$$ 
+
+- $A_{i, j}$: weight between node $i$ and $j$; 
+
+- $\delta(i, j)$: indicator of whether $i$ and $j$ are in the same cluster; 
+
+- $k_i$: the degree of node $i$ (the sum of weights of all edges connected to $i$);
+
+- $m$: the total weight in the all graph.
+
+
 
 
 __Higher modularity implies better partition__:
 <center>![](figures/modularity.jpg){width=80%}</center>
 
 
-__Limits of modularity__: \
+__Limits of modularity__: [@good2010performance]\
 1. Resolution limit. \
    Short version: Modularity maximization forces small communities into larger ones. \
    Longer version: For two clusters $A$ and $B$, if $k_A k_B < 2m$ then modularity increases by merging A and B into a single cluster, even if A and B are distinct clusters.\
@@ -182,12 +191,13 @@ __Limits of modularity__: \
 Modularity-based clustering methods implemented in single cell analysis are mostly greedy algorithms, 
 that are very fast, although not the most accurate approaches. 
 
-   &nbsp;  &nbsp;  __Louvain__:
+   &nbsp;  &nbsp;  __Louvain__: [@blondel2008fast] 
    
 <center>![](figures/Louvain.jpg){width=80%}</center>
 
     
-   &nbsp;   &nbsp;  __Leiden__: Improved Louvain, hybrid of greedy algorithm and sampling technique \
+   &nbsp;   &nbsp;  __Leiden__: [@traag2019louvain] \
+   Improved Louvain, hybrid of greedy algorithm and sampling technique \
    
 ##### __Advantages__: \
   -Fast \
diff --git a/course_files/clustering.Rmd b/course_files/clustering.Rmd
index 48d434c..89cc2a7 100644
--- a/course_files/clustering.Rmd
+++ b/course_files/clustering.Rmd
@@ -59,14 +59,14 @@ Perform Louvain clustering:
 ```{r clustering}
 cl <- igraph::cluster_louvain(deng15)$membership
 colData(deng)$cl <- factor(cl)
-mclust::adjustedRandIndex(colData(deng)$cell_type1, colData(deng)$cl)
+mclust::adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$cl)
 ```
 Reaches very high similarity with the labels provided in the original paper. 
 
 However, it tend to merge small clusters into larger ones. 
 
 ```{r}
-table(deng$cell_type1, cl)
+table(deng$cell_type2, cl)
 ```
 
 
@@ -94,19 +94,15 @@ table(muraro$cell_type1, cl)
 
 Let's run `SC3` clustering on the Deng data. The advantage of the `SC3` is that it can directly ingest a `SingleCellExperiment` object.
 
-Now let's image we do not know the number of clusters _k_ (cell types). `SC3` can estimate a number of clusters for you:
-```{r, eval= F}
+`SC3` can estimate a number of clusters:
+```{r}
 deng <- sc3_estimate_k(deng)
 metadata(deng)$sc3$k_estimation
 ```
 
-Interestingly, the number of cell types predicted by `SC3` is smaller than in the original data annotation. However, early, mid and late stages of different cell types together, we will have exactly 6 cell types. We store the merged cell types in `cell_type1` column of the `colData` slot:
-```{r, eval= F}
-plotPCA(deng, colour_by = "cell_type1")
-```
 
-Now we are ready to run `SC3` (we also ask it to calculate biological properties of the clusters): 
-```{r, eval= F}
+Next we run `SC3` (we also ask it to calculate biological properties of the clusters): 
+```{r}
 deng <- sc3(deng, ks = 10, biology = TRUE, n_cores = 1)
 ```
 
@@ -118,27 +114,27 @@ sc3_plot_consensus(deng, k = 10, show_pdata = "cell_type2")
 ```
 
 Silhouette plot:
-```{r, fig.height=9, eval= F}
+```{r, fig.height=9}
 sc3_plot_silhouette(deng, k = 10)
 ```
 
 Heatmap of the expression matrix:
-```{r, fig.height=6, eval= F}
+```{r, fig.height=6}
 sc3_plot_expression(deng, k = 10, show_pdata = "cell_type2")
 ```
 
 Identified marker genes:
-```{r, fig.height=11, eval= F}
+```{r, fig.height=11}
 sc3_plot_markers(deng, k = 10, show_pdata = "cell_type2")
 ```
 
 PCA plot with highlighted `SC3` clusters:
-```{r, eval= F}
+```{r}
 plotPCA(deng, colour_by = "sc3_10_clusters")
 ```
 
 Compare the results of `SC3` clustering with the original publication cell type labels:
-```{r, eval= F}
+```{r}
 adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$sc3_10_clusters)
 ```
 
@@ -176,7 +172,9 @@ __Note__ Due to direct calculation of distances `SC3` becomes very slow when the
   <center> ![](figures/SingleR_score.png){width=80%} </center>
  - __Step4. Fine tuning__\
   We stop here and assign each cell with label that score the highest, actually, if we set the argument ```fine.tune = FALSE```, that is exactly what the package function ```SingleR``` does. 
-  But there is one more question, what if the second highest score is very close to the highest? 
+  But there is one more question, what if the second highest score is very close to the highest? say, 1, 1, 1, 9.5, 10.
+  `SingleR` sets a threshold to define how close is "very close"; the default is 0.05. 
+  For (only) the cells that fall into this category, it goes back to Step2.
 
 
 #### Example
@@ -314,4 +312,4 @@ plot(
 sessionInfo()
 ```
 
-Among the 2126 cells in the data, only 89 are annotated as different labels as the 
+
diff --git a/course_files/latent-spaces.Rmd b/course_files/latent-spaces.Rmd
index ec109be..922221b 100644
--- a/course_files/latent-spaces.Rmd
+++ b/course_files/latent-spaces.Rmd
@@ -107,7 +107,9 @@ plotPCA(deng, colour_by = "cell_type2") +
 non-linear dependencies. For instance, PCA would not be able to “unroll” the
 following structure.\ <center> ![](figures/pca_limit.png){width=30%} </center>
 
-#### GLM-PCA
+#### [GLM-PCA](https://rdrr.io/cran/glmpca/) 
+[@collins2002generalization] 
+[@townes2019feature]
 
 GLM-PCA is a generalized version of the traditional PCA. 
 
@@ -217,8 +219,7 @@ ggplot(pd, aes(x=dim1, y=dim2, shape=clust, colour=batch)) +
 
 ### tSNE: t-Distributed Stochastic Neighbor Embedding
 
-t-SNE is an advanced version of the original SNE algorithm. <font color="red">
-[ref] </font>
+t-SNE [@maaten2008visualizing] is an advanced version of the original SNE algorithm. [@hinton2003stochastic]
 
 #### Motivation 
 
@@ -321,7 +322,7 @@ Therefore can merely be used for visualization.\
 
 ### Manifold methods
 
-#### UMAP: Uniform Manifold Approximation and Projection
+#### UMAP: Uniform Manifold Approximation and Projection [@mcinnes2018umap]
 
 ##### __Advantages of UMAP over t-SNE:__
 
@@ -368,7 +369,7 @@ plotUMAP(muraro, colour_by="cell_type1")
   
   
 
-#### PHATE
+#### PHATE [@moon2017phate]
 
 ##### Sketch of algorithm 
 
@@ -450,7 +451,7 @@ The __factor loadings__ or weights indicate how much each latent factor is affec
 
 ### [Slalom](https://bioconductor.org/packages/release/bioc/html/slalom.html): Interpretable latent spaces
 
-Highlight of Slalom:
+Highlight of Slalom: [@buettner2017f]
 
 - It incorporates prior information to help the model estimation; 
 
@@ -524,7 +525,8 @@ The `plotTerms` function shows the relevance of all terms in the model, enabling
 plotTerms(model_deng)
 ```
 
-## Autoencoders
+## Autoencoders 
+[@kingma2013auto]
 
 <center>![](figures/vae.jpg){width=80%}</center>
 
-- 
GitLab