Commit 835ad35d authored by Ruqian Lyu

fix conflict

parents 2dd45245 907b2203
Pipeline #943 passed with stage in 4 seconds
......@@ -180,6 +180,7 @@ RUN Rscript -e 'install.packages(c("rsample"))'
RUN Rscript -e 'install.packages(c("Rcpp"))'
RUN Rscript -e 'install.packages(c("rstan"))'
RUN Rscript -e 'install.packages(c("gam"))'
RUN Rscript -e 'devtools::install_github(c("cole-trapnell-lab/monocle3"))'
RUN Rscript -e 'devtools::install_github(c("immunogenomics/harmony", "LTLA/beachmat", "MarioniLab/DropletUtils", "tallulandrews/M3Drop", "hemberg-lab/scRNA.seq.funcs"))'
......
......@@ -1129,9 +1129,193 @@ doi = {10.18637/jss.v059.i10}
}
@article{newman2004finding,
title={Finding and evaluating community structure in networks},
author={Newman, Mark EJ and Girvan, Michelle},
journal={Physical review E},
volume={69},
number={2},
pages={026113},
year={2004},
publisher={APS}
}
@article{blondel2008fast,
title={Fast unfolding of communities in large networks},
author={Blondel, Vincent D and Guillaume, Jean-Loup and Lambiotte, Renaud and Lefebvre, Etienne},
journal={Journal of statistical mechanics: theory and experiment},
volume={2008},
number={10},
pages={P10008},
year={2008},
publisher={IOP Publishing}
}
@article{traag2019louvain,
title={From Louvain to Leiden: guaranteeing well-connected communities},
author={Traag, Vincent A and Waltman, Ludo and van Eck, Nees Jan},
journal={Scientific reports},
volume={9},
year={2019},
publisher={Nature Publishing Group}
}
@article{good2010performance,
title={Performance of modularity maximization in practical contexts},
author={Good, Benjamin H and De Montjoye, Yves-Alexandre and Clauset, Aaron},
journal={Physical Review E},
volume={81},
number={4},
pages={046106},
year={2010},
publisher={APS}
}
@article{freytag2018comparison,
title={Comparison of clustering tools in R for medium-sized 10x Genomics single-cell RNA-sequencing data},
author={Freytag, Saskia and Tian, Luyi and L{\"o}nnstedt, Ingrid and Ng, Milica and Bahlo, Melanie},
journal={F1000Research},
volume={7},
year={2018},
publisher={Faculty of 1000 Ltd}
}
@inproceedings{collins2002generalization,
title={A generalization of principal components analysis to the exponential family},
author={Collins, Michael and Dasgupta, Sanjoy and Schapire, Robert E},
booktitle={Advances in neural information processing systems},
pages={617--624},
year={2002}
}
@inproceedings{hinton2003stochastic,
title={Stochastic neighbor embedding},
author={Hinton, Geoffrey E and Roweis, Sam T},
booktitle={Advances in neural information processing systems},
pages={857--864},
year={2003}
}
@article{maaten2008visualizing,
title={Visualizing data using t-SNE},
author={Maaten, Laurens van der and Hinton, Geoffrey},
journal={Journal of machine learning research},
volume={9},
number={Nov},
pages={2579--2605},
year={2008}
}
@article{moon2017phate,
title={PHATE: a dimensionality reduction method for visualizing trajectory structures in high-dimensional biological data},
author={Moon, Kevin R and van Dijk, David and Wang, Zheng and Chen, William and Hirn, Matthew J and Coifman, Ronald R and Ivanova, Natalia B and Wolf, Guy and Krishnaswamy, Smita},
journal={bioRxiv},
pages={120378},
year={2017},
publisher={Cold Spring Harbor Laboratory}
}
@article{buettner2017f,
title={f-scLVM: scalable and versatile factor analysis for single-cell RNA-seq},
author={Buettner, Florian and Pratanwanich, Naruemon and McCarthy, Davis J and Marioni, John C and Stegle, Oliver},
journal={Genome biology},
volume={18},
number={1},
pages={212},
year={2017},
publisher={BioMed Central}
}
@article{kingma2013auto,
title={Auto-encoding variational bayes},
author={Kingma, Diederik P and Welling, Max},
journal={arXiv preprint arXiv:1312.6114},
year={2013}
}
@article{mcinnes2018umap,
title={{UMAP}: Uniform manifold approximation and projection for dimension reduction},
author={McInnes, Leland and Healy, John and Melville, James},
journal={arXiv preprint arXiv:1802.03426},
year={2018}
}
@article{townes2019feature,
title={Feature Selection and Dimension Reduction for Single Cell RNA-Seq based on a Multinomial Model},
author={Townes, F William and Hicks, Stephanie C and Aryee, Martin J and Irizarry, Rafael A},
journal={bioRxiv},
pages={574574},
year={2019},
publisher={Cold Spring Harbor Laboratory}
}
@ARTICLE{Soneson2018-hy,
title = "{Bias, robustness and scalability in single-cell differential
expression analysis}",
author = "Soneson, Charlotte and Robinson, Mark D",
abstract = "Many methods have been used to determine differential gene
expression from single-cell RNA (scRNA)-seq data. We evaluated
36 approaches using experimental and synthetic data and found
considerable differences in the number and characteristics of
the genes that are called differentially expressed. Prefiltering
of lowly expressed genes has important effects, particularly for
some of the methods developed for bulk RNA-seq data analysis.
However, we found that bulk RNA-seq analysis methods do not
generally perform worse than those developed specifically for
scRNA-seq. We also present conquer, a repository of consistently
processed, analysis-ready public scRNA-seq data sets that is
aimed at simplifying method evaluation and reanalysis of
published results. Each data set provides abundance estimates
for both genes and transcripts, as well as quality control and
exploratory analysis reports.",
journal = "Nature methods",
publisher = "Nature Publishing Group, a division of Macmillan Publishers
Limited. All Rights Reserved.",
month = feb,
year = 2018,
url = "http://dx.doi.org/10.1038/nmeth.4612",
issn = "1548-7091",
doi = "10.1038/nmeth.4612"
}
@ARTICLE{Finak2015-ow,
title = "{MAST: a flexible statistical framework for assessing
transcriptional changes and characterizing heterogeneity in
single-cell RNA sequencing data}",
author = "Finak, Greg and McDavid, Andrew and Yajima, Masanao and Deng,
Jingyuan and Gersuk, Vivian and Shalek, Alex K and Slichter,
Chloe K and Miller, Hannah W and McElrath, M Juliana and Prlic,
Martin and Linsley, Peter S and Gottardo, Raphael",
abstract = "Single-cell transcriptomics reveals gene expression heterogeneity
but suffers from stochastic dropout and characteristic bimodal
expression distributions in which expression is either strongly
non-zero or non-detectable. We propose a two-part, generalized
linear model for such bimodal data that parameterizes both of
these features. We argue that the cellular detection rate, the
fraction of genes expressed in a cell, should be adjusted for as
a source of nuisance variation. Our model provides gene set
enrichment analysis tailored to single-cell data. It provides
insights into how networks of co-expressed genes evolve across an
experimental treatment. MAST is available at
https://github.com/RGLab/MAST .",
journal = "Genome biology",
volume = 16,
number = 1,
pages = "1--13",
year = 2015,
url = "http://dx.doi.org/10.1186/s13059-015-0844-5",
issn = "1465-6906, 1474-760X",
doi = "10.1186/s13059-015-0844-5"
}
@ARTICLE{Bais2019-wv,
title = "{scds: Computational Annotation of Doublets in Single-Cell RNA
Sequencing Data}",
author = "Bais, Abha S and Kostka, Dennis",
abstract = "MOTIVATION: Single-cell RNA sequencing (scRNA-seq) technologies
enable the study of transcriptional heterogeneity at the
......@@ -1166,4 +1350,9 @@ doi = {10.18637/jss.v059.i10}
month = sep,
year = 2019,
language = "en"
url = "http://dx.doi.org/10.1093/bioinformatics/btz698",
language = "en",
issn = "1367-4803, 1367-4811",
pmid = "31501871",
doi = "10.1093/bioinformatics/btz698"
}
......@@ -46,7 +46,7 @@ that it is typically much easier to visualize the data in a 2 or
* Scalability: in the last few years the number of cells in scRNA-seq experiments has grown by several orders of magnitude from ~$10^2$ to ~$10^6$
### Unsupervised clustering methods
Three main ingredients of a complete clustering method:
......@@ -108,7 +108,7 @@ to scRNA-seq data by building a graph where each vertex represents a cell
and the weight of each edge measures the similarity between two cells.
In fact, graph-based clustering is the most popular clustering approach in
scRNA-seq data analysis, and it has been reported to outperform other
clustering methods in many situations [@freytag2018comparison].
##### Why do we want to represent the data as a graph?\
......@@ -123,12 +123,12 @@ clustering methods in many situations (ref).
- __Step 2__: Add weights to obtain a shared nearest neighbour (__SNN__) graph
<center>![](figures/SNN.jpg){width=40%}</center>
There are two ways of adding weights: number and rank.\
- _number_: The number of shared nodes between $u$ and $v$, in this case, 3. \
- _rank_: A measurement of the closeness to their common nearest neighbours [@xu2015identification]; see the sketch below. \
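As a concrete illustration, the sketch below builds both kinds of weighted SNN graph with `scran::buildSNNGraph()`. This is one possible implementation, not necessarily the exact pipeline used to produce the graphs later in this chapter; `deng` is assumed to be the `SingleCellExperiment` used in the clustering examples below.

```{r, eval=FALSE}
# A minimal sketch, assuming the `scran` package and a `deng`
# SingleCellExperiment with log-normalised counts.
library(scran)
library(igraph)

# k nearest neighbours per cell; "rank" and "number" correspond to the two
# weighting schemes described above.
snn_rank   <- buildSNNGraph(deng, k = 10, type = "rank")
snn_number <- buildSNNGraph(deng, k = 10, type = "number")

# Either graph is an ordinary igraph object, ready for graph-based clustering.
cl <- cluster_louvain(snn_rank)$membership
table(cl)
```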
<font color="#bf812d">
......@@ -145,28 +145,37 @@ $$ w(u, v) = K - s(u, v).$$
##### Quality function (Modularity)\
Modularity [@newman2004finding] is not the only quality function for graph-based clustering,
but it is one of the first attempts to embed in a compact form many questions, including
the definition of a quality function, the choice of a null model, etc.\
__The idea of modularity__: A random graph should not have a cluster structure. \
The more "quality" a partition has compared to a random graph, the "better" the partition is.\
Specifically, it is defined by:
the <font color="#bf812d"> quality </font> of a partition on the actual graph $-$ the quality of the same partition on a <font color="#bf812d"> random graph </font>
<font color="#bf812d"> quality </font>: Sum of the weights within clusters \
<font color="#bf812d"> random graph </font>: a copy of the original graph, with some of its properties, but without community structure. The random graph defined by modularity is: each node has the same degree as the original graph.
$$ Q \propto \sum_{i, j} A_{i, j} \delta(i, j) - \sum_{i, j} \dfrac{k_i k_j}{2m} \delta(i, j)$$
- $A_{i, j}$: weight between node $i$ and $j$;
- $\delta(i, j)$: indicator of whether $i$ and $j$ are in the same cluster;
- $k_i$: the degree of node $i$ (the sum of weights of all edges connected to $i$);
- $m$: the total edge weight of the whole graph.
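To make the definition concrete, here is a small sketch using `igraph`'s built-in `modularity()` function on the classic Zachary karate-club graph (an illustrative example, not data from this course): a sensible partition scores well above a random relabelling of the same nodes.

```{r, eval=FALSE}
# Illustrative sketch: modularity of a good partition vs a shuffled one.
library(igraph)
g <- make_graph("Zachary")

good_partition   <- membership(cluster_louvain(g))
random_partition <- sample(good_partition)  # same cluster sizes, random assignment

modularity(g, good_partition)    # clearly positive: better than the null model
modularity(g, random_partition)  # close to zero: no community structure captured
```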
__Higher modularity implies better partition__:
<center>![](figures/modularity.jpg){width=80%}</center>
__Limits of modularity__: [@good2010performance]\
1. Resolution limit. \
Short version: Modularity maximization forces small communities into larger ones. \
Longer version: For two clusters $A$ and $B$, if $k_A k_B < 2m$ then modularity increases by merging $A$ and $B$ into a single cluster, even if $A$ and $B$ are distinct clusters.\
......@@ -182,12 +191,13 @@ __Limits of modularity__: \
Modularity-based clustering methods implemented in single-cell analysis are mostly greedy algorithms,
which are very fast, although not the most accurate approaches.
&nbsp; &nbsp; __Louvain__: [@blondel2008fast]
<center>![](figures/Louvain.jpg){width=80%}</center>
&nbsp; &nbsp; __Leiden__: [@traag2019louvain] \
Improved Louvain; a hybrid of a greedy algorithm and a sampling technique \
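Both algorithms are available in `igraph`. A minimal sketch (assuming igraph >= 1.2.7, which provides `cluster_leiden()`; `g` is a placeholder for an SNN graph such as the one built earlier):

```{r, eval=FALSE}
# Sketch: run Louvain and Leiden on the same SNN graph `g` and compare.
library(igraph)

cl_louvain <- cluster_louvain(g)$membership
cl_leiden  <- cluster_leiden(g, objective_function = "modularity",
                             n_iterations = 10)$membership

# Leiden guarantees that every returned community is internally connected;
# Louvain does not.
table(Louvain = cl_louvain, Leiden = cl_leiden)
```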
##### __Advantages__: \
- Fast \
......@@ -199,7 +209,7 @@ that are very fast, although not the most accurate approaches.
#### Consensus clustering (more robustness, less computational speed)
##### __Motivation (Two problems of $K$-means)__: \
- __Problem 1:__ sensitive to initial partitions \
......@@ -209,7 +219,7 @@ that are very fast, although not the most accurate approaches.
&nbsp; __Solution:__
&nbsp; Run $K$-means with a range of $K$'s.
##### __Algorithm of consensus clustering (simplest version)__:
```{r, eval = F, highlight = F}
for(k in the range of K){
for(each subsample of the data){
......@@ -242,7 +252,7 @@ Say we partitioned four data points into 2 clusters.
<center>![](figures/concensus1.png){width=60%}</center>
- __Step2:__ Concensus matrix: \
- __Step2:__ Consensus matrix: \
Average of all the partitions
<center>![](figures/concensus2.png){width=30%}</center>
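A minimal, self-contained sketch of this idea (a simplified illustration, not the actual `SC3` implementation; `pca_coords` stands for any cells-by-dimensions matrix):

```{r, eval=FALSE}
# Simplified consensus clustering: cluster repeated subsamples with k-means
# and average the co-clustering indicators into a consensus matrix.
consensus_matrix <- function(x, k, n_reps = 50, frac = 0.8) {
  n <- nrow(x)
  consensus <- matrix(0, n, n)
  together  <- matrix(0, n, n)  # how often each pair of cells was subsampled together
  for (rep in seq_len(n_reps)) {
    idx  <- sample(n, size = round(frac * n))
    cl   <- kmeans(x[idx, ], centers = k)$cluster
    same <- outer(cl, cl, "==") * 1
    consensus[idx, idx] <- consensus[idx, idx] + same
    together[idx, idx]  <- together[idx, idx] + 1
  }
  consensus / pmax(together, 1)  # average co-clustering frequency per pair
}

# Final partition: hierarchical clustering on the consensus dissimilarity.
cm    <- consensus_matrix(pca_coords, k = 4)
final <- cutree(hclust(as.dist(1 - cm)), k = 4)
```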
......
......@@ -59,14 +59,14 @@ Perform Louvain clustering:
```{r clustering}
cl <- igraph::cluster_louvain(deng15)$membership
colData(deng)$cl <- factor(cl)
mclust::adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$cl)
```
This reaches very high similarity with the labels provided in the original paper.
However, it tends to merge small clusters into larger ones.
```{r}
table(deng$cell_type2, cl)
```
......@@ -94,19 +94,15 @@ table(muraro$cell_type1, cl)
Let's run `SC3` clustering on the Deng data. The advantage of `SC3` is that it can directly ingest a `SingleCellExperiment` object.
`SC3` can estimate a number of clusters:
```{r}
deng <- sc3_estimate_k(deng)
metadata(deng)$sc3$k_estimation
```
Next we run `SC3` (we also ask it to calculate biological properties of the clusters):
```{r}
deng <- sc3(deng, ks = 10, biology = TRUE, n_cores = 1)
```
......@@ -118,27 +114,27 @@ sc3_plot_consensus(deng, k = 10, show_pdata = "cell_type2")
```
Silhouette plot:
```{r, fig.height=9}
sc3_plot_silhouette(deng, k = 10)
```
Heatmap of the expression matrix:
```{r, fig.height=6}
sc3_plot_expression(deng, k = 10, show_pdata = "cell_type2")
```
Identified marker genes:
```{r, fig.height=11}
sc3_plot_markers(deng, k = 10, show_pdata = "cell_type2")
```
PCA plot with highlighted `SC3` clusters:
```{r}
plotPCA(deng, colour_by = "sc3_10_clusters")
```
Compare the results of `SC3` clustering with the original publication cell type labels:
```{r}
adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$sc3_10_clusters)
```
......@@ -176,7 +172,9 @@ __Note__ Due to direct calculation of distances `SC3` becomes very slow when the
<center> ![](figures/SingleR_score.png){width=80%} </center>
- __Step 4. Fine tuning__\
We could stop here and assign each cell the label with the highest score; in fact, if we set the argument ```fine.tune = FALSE```, that is exactly what the package function ```SingleR``` does.
But there is one more question: what if the second highest score is very close to the highest? Say, 1, 1, 1, 9.5, 10.
`SingleR` sets a threshold to define how close is "very close"; the default is 0.05.
Only for the cells that fall into this category does it go back to Step 2.
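As a hedged sketch of how these options map onto the Bioconductor `SingleR` package (`sce` is the query dataset and `ref` a labelled reference; both are placeholders here):

```{r, eval=FALSE}
# Sketch only: the arguments mirror the steps described above.
library(SingleR)

pred <- SingleR(test = sce, ref = ref, labels = ref$label,
                fine.tune   = TRUE,   # re-score cells whose top scores are ambiguous
                tune.thresh = 0.05)   # how close counts as "very close" to the top score

table(pred$labels)
```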
#### Example
......@@ -314,4 +312,4 @@ plot(
sessionInfo()
```
Among the 2126 cells in the data, only 89 are annotated with labels different from the
......@@ -3,7 +3,7 @@ knit: bookdown::preview_chapter
---
```{r setup, echo=FALSE}
knitr::opts_chunk$set(out.width='90%', fig.align = 'center', echo=FALSE, eval=TRUE, warning=FALSE, message=FALSE)
knitr::opts_knit$set(root.dir = normalizePath(".."))
```
......
......@@ -4,7 +4,7 @@ knit: bookdown::preview_chapter
```{r, echo=FALSE}
library(knitr)
opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE, warning=FALSE, message=FALSE)
knitr::opts_knit$set(root.dir = normalizePath(".."))
```
......
......@@ -13,38 +13,95 @@ knitr::opts_knit$set(root.dir = normalizePath(".."))
### Bulk RNA-seq
One of the most common types of analyses when working with bulk RNA-seq data is
to identify differentially expressed genes. By comparing the genes that change
between two or more conditions, e.g. mutant and wild-type or stimulated and
unstimulated, it is possible to characterize the molecular mechanisms underlying
the change.
Several different methods, e.g.
[edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) and
[DESeq2](https://bioconductor.org/packages/DESeq2), have been developed for bulk
RNA-seq and have become established as parts of robust and widely used analysis
workflows. Moreover, there are also extensive
[datasets](http://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-9-r95)
available where the RNA-seq data has been validated using RT-qPCR. These data
can be used to benchmark DE finding algorithms and the available evidence
suggests that the algorithms are performing well.
### Single cell RNA-seq
In contrast to bulk RNA-seq, in scRNA-seq we often do not have a defined set
of experimental conditions. Instead, as was shown in a previous chapter
(\@ref(clust-methods)) we can identify the cell groups by using an unsupervised
clustering approach. Once the groups have been identified one can find
differentially expressed genes either by comparing the differences in variance
between the groups (like the Kruskal-Wallis test implemented in SC3), or by
comparing gene expression between clusters in a pairwise manner. In the
following chapter we will mainly consider tools developed for pairwise
comparisons.
These methods may also be applied when comparing cells obtained from different
groups or conditions. Such analyses can be complicated by differing cell type
proportions between samples (i.e. the cell populations of distinct samples, which
are the unit of replication in the study). In such cases, it is likely beneficial
to identify distinct cell types and conduct differential expression testing
between conditions within each cell type.
### Differences in Distribution
Unlike bulk RNA-seq, we generally have a large number of samples (i.e. cells)
for each group we are comparing in single-cell experiments. Thus we may be able
to take advantage of the whole distribution of expression values in each group
to identify differences between groups rather than only comparing estimates of
mean expression, as is standard for bulk RNA-seq.
There are two main approaches to comparing distributions. Firstly, we can use
existing statistical models/distributions and fit the same type of model to the
expression in each group, then test for differences in the parameters for each
model, or test whether the model fits better if a particular parameter is allowed
to be different according to group. For instance, in Chapter
\@ref(dealing-with-confounders) we used `edgeR` to test whether allowing mean
expression to be different in different batches significantly improved the fit
of a negative binomial model of the data.
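A minimal sketch of this parametric workflow with `edgeR` (placeholder objects: `counts`, a gene-by-cell count matrix, and `group`, a factor of group labels):

```{r, eval=FALSE}
# Fit a negative binomial GLM per gene and test whether allowing the mean to
# differ between groups improves the fit (likelihood-ratio test).
library(edgeR)

dge    <- DGEList(counts = counts, group = group)
design <- model.matrix(~ group)

dge <- estimateDisp(dge, design)   # estimate NB dispersions
fit <- glmFit(dge, design)         # fit the NB model gene by gene
lrt <- glmLRT(fit, coef = 2)       # test the group coefficient

topTags(lrt)                       # top differentially expressed genes
```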
Alternatively, we can use a non-parametric test which does not assume that
expression values follow any particular distribution, e.g. the
[Kolmogorov-Smirnov test
(KS-test)](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test).
Non-parametric tests generally convert observed expression values to ranks and
test whether the distribution of ranks for one group is significantly different
from the distribution of ranks for the other group. However, some non-parametric
methods fail in the presence of a large number of tied values, such as the case
for dropouts (zeros) in single-cell RNA-seq expression data. Moreover, if the
conditions for a parametric test hold, then it will typically be more powerful
than a non-parametric test.
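For a single gene, such a comparison is a one-liner in base R (a sketch; `expr` is a placeholder vector of normalised expression values for one gene and `group` a two-level factor):

```{r, eval=FALSE}
x <- expr[group == levels(group)[1]]
y <- expr[group == levels(group)[2]]

ks.test(x, y)       # Kolmogorov-Smirnov: any difference in distribution
wilcox.test(x, y)   # Wilcoxon rank-sum: difference in the rank distribution
# Note: both will warn about ties when many cells have zero counts (dropouts).
```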
### Benchmarking of DE methods for scRNA-seq data
So far there has been one high-quality benchmarking study of single-cell
differential expression methods [@Soneson2018-hy]. The figure below summarises
the results from that paper (which is well worth reading in full!):
```{r de-benchmarking, out.width='90%', fig.cap="Figure 5 reproduced from Soneson and Robinson (2018). Summary of DE method performance across all major evaluation criteria. Criteria and cutoff values for performance categories are available in the Online Methods. Methods are ranked by their average performance across the criteria, with the numerical encoding good = 2, intermediate = 1, poor = 0. NODES and SAMseq do not return nominal P values and were therefore not evaluated in terms of the FPR."}
knitr::include_graphics("figures/soneson-de-benchmark-fig5.png")
```
One particularly surprising outcome of this benchmarking study is that almost
all methods designed specifically for the analysis of scRNA-seq data are
outperformed by established bulk RNA-seq DE methods (edgeR, limma) and standard,
classical statistical methods (t-test, Wilcoxon rank-sum tests). MAST
[@Finak2015-ow] is the only method designed specifically for scRNA-seq data that
performs well in this benchmark. These benchmarking results are a credit to the
durability and flexibility of the leading bulk RNA-seq DE methods and a subtle
indictment of the land rush of new scRNA-seq methods that were published without
adequate comparison to existing bulk RNA-seq methods.
### Models of single-cell RNA-seq data
The most common model of RNA-seq data is the negative binomial model:
......@@ -67,9 +124,21 @@ $\mu = mu$
Variance:
$\sigma^2 = \mu + \mu^2/size$
It is parameterized by the mean expression (mu) and the dispersion (size), which
is inversely related to the variance. The negative binomial model fits bulk
RNA-seq data very well and it is used for most statistical methods designed for
such data. In addition, it has been shown to fit the distribution of molecule
counts obtained from data tagged by unique molecular identifiers (UMIs) quite
well ([Grun et al.
2014](http://www.nature.com/nmeth/journal/v11/n6/full/nmeth.2930.html), [Islam
et al. 2011](http://genome.cshlp.org/content/21/7/1160)).
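A quick simulation sketch of this parameterisation (using base R's `rnbinom()`, which accepts `mu` and `size` directly; values are illustrative only):

```{r, eval=FALSE}
mu   <- 10
size <- 0.5
counts <- rnbinom(1e5, mu = mu, size = size)

mean(counts)        # approximately mu
var(counts)         # approximately mu + mu^2 / size
mu + mu^2 / size
```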
However, a raw negative binomial model does not necessarily fit full-length
transcript data as well due to the high dropout rates relative to the non-zero
read counts. For this type of data a variety of zero-inflated negative binomial
models have been proposed (e.g.
[MAST](https://bioconductor.org/packages/release/bioc/html/MAST.html),
[SCDE](https://bioconductor.org/packages/release/bioc/html/scde.html)).
```{r zero-inflation-plot, fig.cap="Zero-inflated Negative Binomial distribution"}
d <- 0.5;
......@@ -92,9 +161,20 @@ $\mu = mu \cdot (1 - d)$
Variance:
$\sigma^2 = \mu \cdot (1-d) \cdot (1 + d \cdot \mu + \mu / size)$
These models introduce a new parameter $d$, for the dropout rate, to the
negative binomial model. As we saw in Chapter 19, the dropout rate of a gene is
strongly correlated with the mean expression of the gene. Different
zero-inflated negative binomial models use different relationships between $\mu$
and $d$, and some may fit $\mu$ and $d$ to the expression of each gene
independently.
Finally, several methods use a Poisson-Beta distribution which is based on a
mechanistic model of transcriptional bursting. There is strong experimental
support for this model ([Kim and Marioni,
2013](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-1-r7))
and it provides a good fit to scRNA-seq data, but it is less easy to use than the
negative binomial models and there are far fewer existing methods to build upon
than for the negative binomial model.
```{r pois-beta-plot, fig.cap="Poisson-Beta distribution"}
a <- 0.1
......@@ -115,9 +195,17 @@ $\mu = g \cdot a / (a + b)$
Variance:
$\sigma^2 = g^2 \cdot a \cdot b/((a + b + 1) \cdot (a + b)^2)$
This model uses three parameters: $a$ the rate of activation of transcription;
$b$ the rate of inhibition of transcription; and $g$ the rate of transcript
production while transcription is active at the locus. Differential expression
methods may test each of the parameters for differences across groups or only
one (often $g$).
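A hedged simulation sketch of the Poisson-Beta model (the locus is active for a Beta(a, b)-distributed fraction of the time, and transcripts are produced at rate $g$ while it is active; parameter values are illustrative only):

```{r, eval=FALSE}
a <- 0.1; b <- 0.5; g <- 100     # illustrative values only
p      <- rbeta(1e5, a, b)       # fraction of time the locus is active
counts <- rpois(1e5, lambda = g * p)

mean(counts)          # close to g * a / (a + b)
g * a / (a + b)
var(counts)           # close to the Beta-driven variance above, plus Poisson noise
```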
All of these models may be further expanded to explicitly account for other
sources of gene expression differences such as batch-effect or library depth
depending on the particular DE algorithm.
__Exercise__: Vary the parameters of each distribution to explore how they
affect the distribution of gene expression. How similar are the Poisson-Beta and
Negative Binomial models?
......@@ -4,7 +4,7 @@ output: html_document
---
```{r setup, echo=FALSE}
knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE)