diff --git a/Dockerfile b/Dockerfile index bac78a1291055a9e522af6daf0140f2dac99d986..093cf049d0d882f9fe616dd8b28f3ff6b1a5a6a8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -180,6 +180,7 @@ RUN Rscript -e 'install.packages(c("rsample"))' RUN Rscript -e 'install.packages(c("Rcpp"))' RUN Rscript -e 'install.packages(c("rstan"))' RUN Rscript -e 'install.packages(c("gam"))' + RUN Rscript -e 'devtools::install_github(c("cole-trapnell-lab/monocle3"))' RUN Rscript -e 'devtools::install_github(c("immunogenomics/harmony", "LTLA/beachmat", "MarioniLab/DropletUtils", "tallulandrews/M3Drop", "hemberg-lab/scRNA.seq.funcs"))' diff --git a/course_files/book.bib b/course_files/book.bib index e37e01a4507a3d03465b938c100c3e135a73e212..1745dd197f13541e8cd73319f78e86a4016ec8ce 100644 --- a/course_files/book.bib +++ b/course_files/book.bib @@ -1129,9 +1129,193 @@ doi = {10.18637/jss.v059.i10} } -@ARTICLE{Bais2019-hf, - title = "scds: Computational Annotation of Doublets in {Single-Cell} {RNA} - Sequencing Data", + + +@article{newman2004finding, + title={Finding and evaluating community structure in networks}, + author={Newman, Mark EJ and Girvan, Michelle}, + journal={Physical review E}, + volume={69}, + number={2}, + pages={026113}, + year={2004}, + publisher={APS} +} + +@article{blondel2008fast, + title={Fast unfolding of communities in large networks}, + author={Blondel, Vincent D and Guillaume, Jean-Loup and Lambiotte, Renaud and Lefebvre, Etienne}, + journal={Journal of statistical mechanics: theory and experiment}, + volume={2008}, + number={10}, + pages={P10008}, + year={2008}, + publisher={IOP Publishing} +} + +@article{traag2019louvain, + title={From Louvain to Leiden: guaranteeing well-connected communities}, + author={Traag, Vincent A and Waltman, Ludo and van Eck, Nees Jan}, + journal={Scientific reports}, + volume={9}, + year={2019}, + publisher={Nature Publishing Group} +} + +@article{good2010performance, + title={Performance of modularity maximization in practical 
contexts}, + author={Good, Benjamin H and De Montjoye, Yves-Alexandre and Clauset, Aaron}, + journal={Physical Review E}, + volume={81}, + number={4}, + pages={046106}, + year={2010}, + publisher={APS} +} + +@article{freytag2018comparison, + title={Comparison of clustering tools in R for medium-sized 10x Genomics single-cell RNA-sequencing data}, + author={Freytag, Saskia and Tian, Luyi and L{\"o}nnstedt, Ingrid and Ng, Milica and Bahlo, Melanie}, + journal={F1000Research}, + volume={7}, + year={2018}, + publisher={Faculty of 1000 Ltd} +} + +@inproceedings{collins2002generalization, + title={A generalization of principal components analysis to the exponential family}, + author={Collins, Michael and Dasgupta, Sanjoy and Schapire, Robert E}, + booktitle={Advances in neural information processing systems}, + pages={617--624}, + year={2002} +} + +@inproceedings{hinton2003stochastic, + title={Stochastic neighbor embedding}, + author={Hinton, Geoffrey E and Roweis, Sam T}, + booktitle={Advances in neural information processing systems}, + pages={857--864}, + year={2003} +} + +@article{maaten2008visualizing, + title={Visualizing data using t-SNE}, + author={Maaten, Laurens van der and Hinton, Geoffrey}, + journal={Journal of machine learning research}, + volume={9}, + number={Nov}, + pages={2579--2605}, + year={2008} +} + +@article{moon2017phate, + title={PHATE: a dimensionality reduction method for visualizing trajectory structures in high-dimensional biological data}, + author={Moon, Kevin R and van Dijk, David and Wang, Zheng and Chen, William and Hirn, Matthew J and Coifman, Ronald R and Ivanova, Natalia B and Wolf, Guy and Krishnaswamy, Smita}, + journal={bioRxiv}, + pages={120378}, + year={2017}, + publisher={Cold Spring Harbor Laboratory} +} + +@article{buettner2017f, + title={f-scLVM: scalable and versatile factor analysis for single-cell RNA-seq}, + author={Buettner, Florian and Pratanwanich, Naruemon and McCarthy, Davis J and Marioni, John C and Stegle, Oliver}, 
+ journal={Genome biology}, + volume={18}, + number={1}, + pages={212}, + year={2017}, + publisher={BioMed Central} +} + +@article{kingma2013auto, + title={Auto-encoding variational bayes}, + author={Kingma, Diederik P and Welling, Max}, + journal={arXiv preprint arXiv:1312.6114}, + year={2013} +} + + +@article{mcinnes2018umap, + title={Umap: Uniform manifold approximation and projection for dimension reduction}, + author={McInnes, Leland and Healy, John and Melville, James}, + journal={arXiv preprint arXiv:1802.03426}, + year={2018} +} + +@article{townes2019feature, + title={Feature Selection and Dimension Reduction for Single Cell RNA-Seq based on a Multinomial Model}, + author={Townes, F William and Hicks, Stephanie C and Aryee, Martin J and Irizarry, Rafael A}, + journal={bioRxiv}, + pages={574574}, + year={2019}, + publisher={Cold Spring Harbor Laboratory} +} + +@ARTICLE{Soneson2018-hy, + title = "{Bias, robustness and scalability in single-cell differential + expression analysis}", + author = "Soneson, Charlotte and Robinson, Mark D", + abstract = "Many methods have been used to determine differential gene + expression from single-cell RNA (scRNA)-seq data. We evaluated + 36 approaches using experimental and synthetic data and found + considerable differences in the number and characteristics of + the genes that are called differentially expressed. Prefiltering + of lowly expressed genes has important effects, particularly for + some of the methods developed for bulk RNA-seq data analysis. + However, we found that bulk RNA-seq analysis methods do not + generally perform worse than those developed specifically for + scRNA-seq. We also present conquer, a repository of consistently + processed, analysis-ready public scRNA-seq data sets that is + aimed at simplifying method evaluation and reanalysis of + published results. 
Each data set provides abundance estimates + for both genes and transcripts, as well as quality control and + exploratory analysis reports.", + journal = "Nature methods", + publisher = "Nature Publishing Group, a division of Macmillan Publishers + Limited. All Rights Reserved.", + month = feb, + year = 2018, + url = "http://dx.doi.org/10.1038/nmeth.4612", + issn = "1548-7091", + doi = "10.1038/nmeth.4612" +} + + +@ARTICLE{Finak2015-ow, + title = "{MAST: a flexible statistical framework for assessing + transcriptional changes and characterizing heterogeneity in + single-cell RNA sequencing data}", + author = "Finak, Greg and McDavid, Andrew and Yajima, Masanao and Deng, + Jingyuan and Gersuk, Vivian and Shalek, Alex K and Slichter, + Chloe K and Miller, Hannah W and McElrath, M Juliana and Prlic, + Martin and Linsley, Peter S and Gottardo, Raphael", + abstract = "Single-cell transcriptomics reveals gene expression heterogeneity + but suffers from stochastic dropout and characteristic bimodal + expression distributions in which expression is either strongly + non-zero or non-detectable. We propose a two-part, generalized + linear model for such bimodal data that parameterizes both of + these features. We argue that the cellular detection rate, the + fraction of genes expressed in a cell, should be adjusted for as + a source of nuisance variation. Our model provides gene set + enrichment analysis tailored to single-cell data. It provides + insights into how networks of co-expressed genes evolve across an + experimental treatment. 
MAST is available at + https://github.com/RGLab/MAST .", + journal = "Genome biology", + volume = 16, + number = 1, + pages = "1--13", + year = 2015, + url = "http://dx.doi.org/10.1186/s13059-015-0844-5", + issn = "1465-6906, 1474-760X", + doi = "10.1186/s13059-015-0844-5" +} + + +@ARTICLE{Bais2019-wv, + title = "{scds: Computational Annotation of Doublets in Single-Cell RNA + Sequencing Data}", author = "Bais, Abha S and Kostka, Dennis", abstract = "MOTIVATION: Single-cell RNA sequencing (scRNA-seq) technologies enable the study of transcriptional heterogeneity at the @@ -1166,4 +1350,9 @@ doi = {10.18637/jss.v059.i10} month = sep, year = 2019, language = "en" + url = "http://dx.doi.org/10.1093/bioinformatics/btz698", + language = "en", + issn = "1367-4803, 1367-4811", + pmid = "31501871", + doi = "10.1093/bioinformatics/btz698" } diff --git a/course_files/clust-intro.Rmd b/course_files/clust-intro.Rmd index 47b126b85debfd24bd6f7a6f0365ea6e8601732e..c2061aac62f08040c66d76d70769a874f3dc0401 100644 --- a/course_files/clust-intro.Rmd +++ b/course_files/clust-intro.Rmd @@ -46,7 +46,7 @@ that it is typically much easier to visualize the data in a 2 or * Scalability: in the last few years the number of cells in scRNA-seq experiments has grown by several orders of magnitude from ~$10^2$ to ~$10^6$ -### unsupervised Clustering methods +### Unsupervised clustering methods Three main ingredients of a complete clustering method: @@ -108,7 +108,7 @@ to scRNA-seq data by building a graph where each vertice represents a cell and (weight of) the edge measures similarity between two cells. Actually, graph-based clustering is the most popular clustering algorithm in scRNA-seq data analysis, and has been reported to have outperformed other -clustering methods in many situations (ref). +clustering methods in many situations [@freytag2018comparison]. ##### Why do we want to represent the data as a graph?\ @@ -123,12 +123,12 @@ clustering methods in many situations (ref). 
- __Step2__: Add weights, and obtain a shared nearest neighbour (__SNN__) graph -<center>{width= 4%}</center> +<center>{width=40%}</center> There are two ways of adding weights: number and rank.\ - _number_: The number of shared nodes between $u$ and $v$, in this case, 3. \ -- _rank_: A measurement of the closeness to their common nearest neighbours. (ref) \ +- _rank_: A measurement of the closeness to their common nearest neighbours. (@xu2015identification) \ <font color="#bf812d"> @@ -145,28 +145,37 @@ $$ w(u, v) = K - s(u, v).$$ ##### Quality function (Modularity)\ -Modularity is not the only quality function for graph-based clustering, +Modularity [@newman2004finding] is not the only quality function for graph-based clustering, but it is one of the first attempts to embed in a compact form many questions including -<font color="red"> ... </font>.\ +the definition of quality function and null model etc.\ __The idea of modularity__: A random graph should not have a cluster structure. \ The more "quality" a partition has compared to a random graph, the "better" the partition is.\ Specifically, it is defined by: the <font color="#bf812d"> quality </font> of a partition on the actual graph $-$ the quality of the same partition on a <font color="#bf812d"> random graph </font> - + <font color="#bf812d"> quality </font>: Sum of the weights within clusters \ <font color="#bf812d"> random graph </font>: a copy of the original graph, with some of its properties, but without community structure. The random graph defined by modularity is: each node has the same degree as the original graph. 
- $$ Q \propto \sum_{i, j} A_{i, j} \delta(i, j) - \sum_{i, j} \dfrac{k_i k_j}{2m} \delta(i, j)$$ -<font color="red"> [notations] </font> + $$ Q \propto \sum_{i, j} A_{i, j} \delta(i, j) - \sum_{i, j} \dfrac{k_i k_j}{2m} \delta(i, j)$$ + +- $A_{i, j}$: weight between node $i$ and $j$; + +- $\delta(i, j)$: indicator of whether $i$ and $j$ are in the same cluster; + +- $k_i$: the degree of node $i$ (the sum of weights of all edges connected to $i$); + +- $m$: the total weight in the whole graph. + + __Higher modularity implies better partition__: <center>{width=80%}</center> -__Limits of modularity__: \ +__Limits of modularity__: [@good2010performance]\ 1. Resolution limit. \ Short version: Modularity maximization forces small communities into larger ones. \ Longer version: For two clusters $A$ and $B$, if $k_A k_B < 2m$ then modularity increases by merging A and B into a single cluster, even if A and B are distinct clusters.\ @@ -182,12 +191,13 @@ __Limits of modularity__: \ Modularity-based clustering methods implemented in single cell analysis are mostly greedy algorithms, that are very fast, although not the most accurate approaches. - __Louvain__: + __Louvain__: [@blondel2008fast] <center>{width=80%}</center> - __Leiden__: Improved Louvain, hybrid of greedy algorithm and sampling technique \ + __Leiden__: [@traag2019louvain] \ + Improved Louvain, hybrid of greedy algorithm and sampling technique \ ##### __Advantages__: \ -Fast \ @@ -199,7 +209,7 @@ that are very fast, although not the most accurate approaches. -#### Concensus clustering (more robustness, less computational speed) +#### Consensus clustering (more robustness, less computational speed) ##### __Motivation (Two problems of $K$-means)__: \ - __Problem1:__ sensitive to initial partitions \ @@ -209,7 +219,7 @@ that are very fast, although not the most accurate approaches. __Solution:__ Run $K$-means with a range of $K$'s. 
-##### __Algorithm of concensus clustering (simpliest version)__: +##### __Algorithm of consensus clustering (simplest version)__: ```{r, eval = F, highlight = F} for(k in the range of K){ for(each subsample of the data){ @@ -242,7 +252,7 @@ Say we partitioned four data points into 2 clusters. <center>{width=60%}</center> -- __Step2:__ Concensus matrix: \ +- __Step2:__ Consensus matrix: \ Average of all the partitions <center>{width=30%}</center> diff --git a/course_files/clustering.Rmd b/course_files/clustering.Rmd index 48d434cf1889f9784bd39d9f5145cc88b25e3bd7..89cc2a7ba70e675151b8e42c7e6d3bbe8b2063ee 100644 --- a/course_files/clustering.Rmd +++ b/course_files/clustering.Rmd @@ -59,14 +59,14 @@ Perform Louvain clustering: ```{r clustering} cl <- igraph::cluster_louvain(deng15)$membership colData(deng)$cl <- factor(cl) -mclust::adjustedRandIndex(colData(deng)$cell_type1, colData(deng)$cl) +mclust::adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$cl) ``` Reaches very high similarity with the labels provided in the original paper. However, it tend to merge small clusters into larger ones. ```{r} -table(deng$cell_type1, cl) +table(deng$cell_type2, cl) ``` @@ -94,19 +94,15 @@ table(muraro$cell_type1, cl) Let's run `SC3` clustering on the Deng data. The advantage of the `SC3` is that it can directly ingest a `SingleCellExperiment` object. -Now let's image we do not know the number of clusters _k_ (cell types). `SC3` can estimate a number of clusters for you: -```{r, eval= F} +`SC3` can estimate a number of clusters: +```{r} deng <- sc3_estimate_k(deng) metadata(deng)$sc3$k_estimation ``` -Interestingly, the number of cell types predicted by `SC3` is smaller than in the original data annotation. However, early, mid and late stages of different cell types together, we will have exactly 6 cell types. 
We store the merged cell types in `cell_type1` column of the `colData` slot: -```{r, eval= F} -plotPCA(deng, colour_by = "cell_type1") -``` -Now we are ready to run `SC3` (we also ask it to calculate biological properties of the clusters): -```{r, eval= F} +Next we run `SC3` (we also ask it to calculate biological properties of the clusters): +```{r} deng <- sc3(deng, ks = 10, biology = TRUE, n_cores = 1) ``` @@ -118,27 +114,27 @@ sc3_plot_consensus(deng, k = 10, show_pdata = "cell_type2") ``` Silhouette plot: -```{r, fig.height=9, eval= F} +```{r, fig.height=9} sc3_plot_silhouette(deng, k = 10) ``` Heatmap of the expression matrix: -```{r, fig.height=6, eval= F} +```{r, fig.height=6} sc3_plot_expression(deng, k = 10, show_pdata = "cell_type2") ``` Identified marker genes: -```{r, fig.height=11, eval= F} +```{r, fig.height=11} sc3_plot_markers(deng, k = 10, show_pdata = "cell_type2") ``` PCA plot with highlighted `SC3` clusters: -```{r, eval= F} +```{r} plotPCA(deng, colour_by = "sc3_10_clusters") ``` Compare the results of `SC3` clustering with the original publication cell type labels: -```{r, eval= F} +```{r} adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$sc3_10_clusters) ``` @@ -176,7 +172,9 @@ __Note__ Due to direct calculation of distances `SC3` becomes very slow when the <center> {width=80%} </center> - __Step4. Fine tuning__\ We stop here and assign each cell with label that score the highest, actually, if we set the argument ```fine.tune = FALSE```, that is exactly what the package function ```SingleR``` does. - But there is one more question, what if the second highest score is very close to the highest? + But there is one more question, what if the second highest score is very close to the highest? say, 1, 1, 1, 9.5, 10. + `SingleR` sets a threshold to define how close is "very close", the default is 0.05. + For (only) the cells that fall into this category, it goes back to Step2. 
#### Example @@ -314,4 +312,4 @@ plot( sessionInfo() ``` -Among the 2126 cells in the data, only 89 are annotated as different labels as the + diff --git a/course_files/confounders-reads.Rmd b/course_files/confounders-reads.Rmd index 88cfb32434fc1cdb9e20b0e477ce7bb24fc41664..7c4b66041ac100a7d0d8b08957499d3924f73674 100644 --- a/course_files/confounders-reads.Rmd +++ b/course_files/confounders-reads.Rmd @@ -3,7 +3,7 @@ knit: bookdown::preview_chapter --- ```{r setup, echo=FALSE} -knitr::opts_chunk$set(out.width='90%', fig.align = 'center', echo=FALSE, eval=TRUE) +knitr::opts_chunk$set(out.width='90%', fig.align = 'center', echo=FALSE, eval=TRUE, warning=FALSE, message=FALSE) knitr::opts_knit$set(root.dir = normalizePath("..")) ``` diff --git a/course_files/confounders.Rmd b/course_files/confounders.Rmd index 3a4b5db87145c8570563801602b0c50fce3333fa..aabc57800cd68422bea25ce66993c8322ce24426 100644 --- a/course_files/confounders.Rmd +++ b/course_files/confounders.Rmd @@ -4,7 +4,7 @@ knit: bookdown::preview_chapter ```{r, echo=FALSE} library(knitr) -opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE) +opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE, warning=FALSE, message=FALSE) knitr::opts_knit$set(root.dir = normalizePath("..")) ``` diff --git a/course_files/de-intro.Rmd b/course_files/de-intro.Rmd index 3cd273e5acfb57b5135de837ab5e3621880e4059..6504843920eb373d7b5dc5e76180f358ee076f5d 100644 --- a/course_files/de-intro.Rmd +++ b/course_files/de-intro.Rmd @@ -13,38 +13,95 @@ knitr::opts_knit$set(root.dir = normalizePath("..")) ### Bulk RNA-seq -One of the most common types of analyses when working with bulk RNA-seq -data is to identify differentially expressed genes. By comparing the -genes that change between two conditions, e.g. mutant and wild-type or -stimulated and unstimulated, it is possible to characterize the -molecular mechanisms underlying the change. - -Several different methods, -e.g. 
[DESeq2](https://bioconductor.org/packages/DESeq2) and -[edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html), -have been developed for bulk RNA-seq. Moreover, there are also -extensive +One of the most common types of analyses when working with bulk RNA-seq data is +to identify differentially expressed genes. By comparing the genes that change +between two or more conditions, e.g. mutant and wild-type or stimulated and +unstimulated, it is possible to characterize the molecular mechanisms underlying +the change. + +Several different methods, e.g. +[edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) and +[DESeq2](https://bioconductor.org/packages/DESeq2) and more, have been developed +for bulk RNA-seq and become established as parts of robust and widely-used +analysis workflows. Moreover, there are also extensive [datasets](http://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-9-r95) -available where the RNA-seq data has been validated using -RT-qPCR. These data can be used to benchmark DE finding algorithms and the available evidence suggests that the algorithms are performing quite well. +available where the RNA-seq data has been validated using RT-qPCR. These data +can be used to benchmark DE finding algorithms and the available evidence +suggests that the algorithms are performing well. + ### Single cell RNA-seq -In contrast to bulk RNA-seq, in scRNA-seq we usually do not have a defined -set of experimental conditions. Instead, as was shown in a previous chapter +In contrast to bulk RNA-seq, in scRNA-seq we often do not have a defined set +of experimental conditions. Instead, as was shown in a previous chapter (\@ref(clust-methods)) we can identify the cell groups by using an unsupervised -clustering approach. 
Once the groups have been identified one can find differentially -expressed genes either by comparing the differences in variance between the groups (like the Kruskal-Wallis test implemented in SC3), or by comparing gene expression between clusters in a pairwise manner. In the following chapter we will mainly consider tools developed for pairwise comparisons. +clustering approach. Once the groups have been identified one can find +differentially expressed genes either by comparing the differences in variance +between the groups (like the Kruskal-Wallis test implemented in SC3), or by +comparing gene expression between clusters in a pairwise manner. In the +following chapter we will mainly consider tools developed for pairwise +comparisons. + +These methods may also be applied when comparing cells obtained from different +groups or conditions. Such analyses can be complicated by differing cell type +proportions between samples (i.e. distinct samples' cell populations; the unit of +replication in the study). In such cases, it is likely beneficial to identify +distinct cell types and conduct differential expression testing between +conditions within each cell type. + ### Differences in Distribution -Unlike bulk RNA-seq, we generally have a large number of samples (i.e. cells) for each group we are comparing in single-cell experiments. Thus we can take advantage of the whole distribution of expression values in each group to identify differences between groups rather than only comparing estimates of mean-expression as is standard for bulk RNASeq. +Unlike bulk RNA-seq, we generally have a large number of samples (i.e. cells) +for each group we are comparing in single-cell experiments. Thus we may be able +to take advantage of the whole distribution of expression values in each group +to identify differences between groups rather than only comparing estimates of +mean-expression as is standard for bulk RNASeq. + +There are two main approaches to comparing distributions. 
Firstly, we can use +existing statistical models/distributions and fit the same type of model to the +expression in each group then test for differences in the parameters for each +model, or test whether the model fits better if a particular parameter is allowed +to be different according to group. For instance in Chapter +\@ref(dealing-with-confounders) we used `edgeR` to test whether allowing mean +expression to be different in different batches significantly improved the fit +of a negative binomial model of the data. + +Alternatively, we can use a non-parametric test which does not assume that +expression values follow any particular distribution, e.g. the +[Kolmogorov-Smirnov test +(KS-test)](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test). +Non-parametric tests generally convert observed expression values to ranks and +test whether the distribution of ranks for one group is significantly different +from the distribution of ranks for the other group. However, some non-parametric +methods fail in the presence of a large number of tied values, such as the case +for dropouts (zeros) in single-cell RNA-seq expression data. Moreover, if the +conditions for a parametric test hold, then it will typically be more powerful +than a non-parametric test. + + +### Benchmarking of DE methods for scRNA-seq data + +So far there has been one high-quality benchmarking study of single-cell +differential expression methods [@Soneson2018-hy]. The figure below summarises +the results from that paper (which is well worth reading in full!): + +```{r de-benchmarking, out.width='90%', fig.cap="Figure 5 reproduced from Soneson and Robinson (2018). Summary of DE method performance across all major evaluation criteria. Criteria and cutoff values for performance categories are available in the Online Methods. Methods are ranked by their average performance across the criteria, with the numerical encoding good = 2, intermediate = 1, poor = 0. 
NODES and SAMseq do not return nominal P values and were therefore not evaluated in terms of the FPR."} +knitr::include_graphics("figures/soneson-de-benchmark-fig5.png") +``` -There are two main approaches to comparing distributions. Firstly, we can use existing statistical models/distributions and fit the same type of model to the expression in each group then test for differences in the parameters for each model, or test whether the model fits better if a particular paramter is allowed to be different according to group. For instance in Chapter \@ref(dealing-with-confounders) we used edgeR to test whether allowing mean expression to be different in different batches significantly improved the fit of a negative binomial model of the data. +One particularly surprising outcome of this benchmarking study is that almost +all methods designed specifically for the analysis of scRNA-seq data are +outperformed by established bulk RNA-seq DE methods (edgeR, limma) and standard, +classical statistical methods (t-test, Wilcoxon rank-sum tests). MAST +[@Finak2015-ow] is the only method designed specifically for scRNA-seq data that +performs well in this benchmark. These benchmarking results are a credit to the +durability and flexibility of the leading bulk RNA-seq DE methods and a subtle +indictment of the land rush of new scRNA-seq methods that were published without +adequate comparison to existing bulk RNA-seq methods. -Alternatively, we can use a non-parametric test which does not assume that expression values follow any particular distribution, e.g. the [Kolmogorov-Smirnov test (KS-test)](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test). Non-parametric tests generally convert observed expression values to ranks and test whether the distribution of ranks for one group are signficantly different from the distribution of ranks for the other group. 
However, some non-parametric methods fail in the presence of a large number of tied values, such as the case for dropouts (zeros) in single-cell RNA-seq expression data. Moreover, if the conditions for a parametric test hold, then it will typically be more powerful than a non-parametric test. -### Models of single-cell RNASeq data +### Models of single-cell RNA-seq data The most common model of RNASeq data is the negative binomial model: @@ -67,9 +124,21 @@ $\mu = mu$ Variance: $\sigma^2 = mu + mu^2/size$ -It is parameterized by the mean expression (mu) and the dispersion (size), which is inversely related to the variance. The negative binomial model fits bulk RNA-seq data very well and it is used for most statistical methods designed for such data. In addition, it has been show to fit the distribution of molecule counts obtained from data tagged by unique molecular identifiers (UMIs) quite well ([Grun et al. 2014](http://www.nature.com/nmeth/journal/v11/n6/full/nmeth.2930.html), [Islam et al. 2011](http://genome.cshlp.org/content/21/7/1160)). - -However, a raw negative binomial model does not fit full-length transcript data as well due to the high dropout rates relative to the non-zero read counts. For this type of data a variety of zero-inflated negative binomial models have been proposed (e.g. [MAST](https://bioconductor.org/packages/release/bioc/html/MAST.html), [SCDE](https://bioconductor.org/packages/release/bioc/html/scde.html)). +It is parameterized by the mean expression (mu) and the dispersion (size), which +is inversely related to the variance. The negative binomial model fits bulk +RNA-seq data very well and it is used for most statistical methods designed for +such data. In addition, it has been show to fit the distribution of molecule +counts obtained from data tagged by unique molecular identifiers (UMIs) quite +well ([Grun et al. +2014](http://www.nature.com/nmeth/journal/v11/n6/full/nmeth.2930.html), [Islam +et al. 
2011](http://genome.cshlp.org/content/21/7/1160)). + +However, a raw negative binomial model does not necessarily fit full-length +transcript data as well due to the high dropout rates relative to the non-zero +read counts. For this type of data a variety of zero-inflated negative binomial +models have been proposed (e.g. +[MAST](https://bioconductor.org/packages/release/bioc/html/MAST.html), +[SCDE](https://bioconductor.org/packages/release/bioc/html/scde.html)). ```{r zero-inflation-plot, fig.cap="Zero-inflated Negative Binomial distribution"} d <- 0.5; @@ -92,9 +161,20 @@ $\mu = mu \cdot (1 - d)$ Variance: $\sigma^2 = \mu \cdot (1-d) \cdot (1 + d \cdot \mu + \mu / size)$ -These models introduce a new parameter $d$, for the dropout rate, to the negative binomial model. As we saw in Chapter 19, the dropout rate of a gene is strongly correlated with the mean expression of the gene. Different zero-inflated negative binomial models use different relationships between mu and d and some may fit $\mu$ and $d$ to the expression of each gene independently. - -Finally, several methods use a Poisson-Beta distribution which is based on a mechanistic model of transcriptional bursting. There is strong experimental support for this model ([Kim and Marioni, 2013](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-1-r7)) and it provides a good fit to scRNA-seq data but it is less easy to use than the negative-binomial models and much less existing methods upon which to build than the negative binomial model. +These models introduce a new parameter $d$, for the dropout rate, to the +negative binomial model. As we saw in Chapter 19, the dropout rate of a gene is +strongly correlated with the mean expression of the gene. Different +zero-inflated negative binomial models use different relationships between mu +and d and some may fit $\mu$ and $d$ to the expression of each gene +independently. 
+ +Finally, several methods use a Poisson-Beta distribution which is based on a +mechanistic model of transcriptional bursting. There is strong experimental +support for this model ([Kim and Marioni, +2013](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-1-r7)) +and it provides a good fit to scRNA-seq data but it is less easy to use than the +negative-binomial models and much less existing methods upon which to build than +the negative binomial model. ```{r pois-beta-plot, fit.cap="Poisson-Beta distribution"} a <- 0.1 @@ -115,9 +195,17 @@ $\mu = g \cdot a / (a + b)$ Variance: $\sigma^2 = g^2 \cdot a \cdot b/((a + b + 1) \cdot (a + b)^2)$ -This model uses three parameters: $a$ the rate of activation of transcription; $b$ the rate of inhibition of transcription; and $g$ the rate of transcript production while transcription is active at the locus. Differential expression methods may test each of the parameters for differences across groups or only one (often $g$). +This model uses three parameters: $a$ the rate of activation of transcription; +$b$ the rate of inhibition of transcription; and $g$ the rate of transcript +production while transcription is active at the locus. Differential expression +methods may test each of the parameters for differences across groups or only +one (often $g$). -All of these models may be further expanded to explicitly account for other sources of gene expression differences such as batch-effect or library depth depending on the particular DE algorithm. +All of these models may be further expanded to explicitly account for other +sources of gene expression differences such as batch-effect or library depth +depending on the particular DE algorithm. -__Exercise__: Vary the parameters of each distribution to explore how they affect the distribution of gene expression. How similar are the Poisson-Beta and Negative Binomial models? 
+__Exercise__: Vary the parameters of each distribution to explore how they +affect the distribution of gene expression. How similar are the Poisson-Beta and +Negative Binomial models? diff --git a/course_files/de-real.Rmd b/course_files/de-real.Rmd index 4550333a2178cd4a8c7216de231f24fecf18a247..2e566998fd770e6e1470629bd7f70dedda974250 100644 --- a/course_files/de-real.Rmd +++ b/course_files/de-real.Rmd @@ -4,7 +4,7 @@ output: html_document --- ```{r setup, echo=FALSE} -knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE) +knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=FALSE) knitr::opts_knit$set(root.dir = normalizePath("..")) ``` diff --git a/course_files/exprs-norm-reads.Rmd b/course_files/exprs-norm-reads.Rmd index beab86df25ca88b711965457c68aa5ae9ac15da2..ac95675fa5d621ec323910ac887a784bfac0f603 100644 --- a/course_files/exprs-norm-reads.Rmd +++ b/course_files/exprs-norm-reads.Rmd @@ -3,7 +3,7 @@ output: html_document --- ```{r setup, echo=FALSE} -knitr::opts_chunk$set(out.width='90%', fig.align = 'center', echo=FALSE, eval=TRUE) +knitr::opts_chunk$set(out.width='90%', fig.align = 'center', echo=FALSE, eval=TRUE, warning=FALSE, message=FALSE) knitr::opts_knit$set(root.dir = normalizePath("..")) ``` @@ -80,7 +80,7 @@ plotRLE( ) ``` -```{r norm-ours-sctransform-reads} +```{r norm-ours-sctransform-reads, results='hide'} umi_sparse <- as(counts(reads.qc), "dgCMatrix") ### Genes expressed in at least 5 cells will be kept sctnorm_data <- sctransform::vst(umi = umi_sparse, min_cells = 1, diff --git a/course_files/exprs-norm.Rmd b/course_files/exprs-norm.Rmd index ce319c44788f548aa090271381c67fbdb4e22748..e0ad272a26116e2029d46b424642d2f9d75e1777 100644 --- a/course_files/exprs-norm.Rmd +++ b/course_files/exprs-norm.Rmd @@ -3,7 +3,7 @@ output: html_document --- ```{r setup, echo=FALSE} -knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE) +knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE, 
warning=FALSE, message=FALSE) knitr::opts_knit$set(root.dir = normalizePath("..")) ``` @@ -459,13 +459,15 @@ Note that (due to what looks like a bug in this version of `sctransform`) we need to convert the UMI count matrix to a sparse format to apply sctransform. -```{r sctransform-apply} +```{r sctransform-apply, warning=FALSE, message=FALSE, results='hide'} umi_sparse <- as(counts(umi.qc), "dgCMatrix") ### Genes expressed in at least 5 cells will be kept sctnorm_data <- sctransform::vst(umi = umi_sparse, min_cells = 1, cell_attr = as.data.frame(colData(umi.qc)), latent_var = "log10_total_counts_endogenous") +``` +```{r sctransform-add-to-sce, warning=FALSE, message=FALSE} ## Pearson residuals, or deviance residuals dim(sctnorm_data$y) dim(umi.qc) @@ -475,7 +477,7 @@ assay(umi.qc, "sctrans_norm") <- sctnorm_data$y Let us look at the NB GLM model parameters estimated by sctransform. -```{r sctransform-params-plot} +```{r sctransform-params-plot, warning=FALSE, message=FALSE} #sce$log10_total_counts ## Matrix of estimated model parameters per gene (theta and regression coefficients) sctransform::plot_model_pars(sctnorm_data) @@ -484,14 +486,14 @@ sctransform::plot_model_pars(sctnorm_data) We can look at the effect of sctransform's normalization on three particular genes, ACTB, POU5F1 (aka OCT4) and CD74. 
-```{r sctransform-genes-plot} +```{r sctransform-genes-plot, warning=FALSE, message=FALSE} ##c('ACTB', 'Rpl10', 'Cd74') genes_plot <- c("ENSG00000075624", "ENSG00000204531", "ENSG00000019582") sctransform::plot_model(sctnorm_data, umi_sparse, genes_plot, plot_residual = TRUE, cell_attr = as.data.frame(colData(umi.qc))) ``` -```{r norm-pca-sctransform, fig.cap = "PCA plot of the tung data after sctransform normalisation (Pearson residuals)."} +```{r norm-pca-sctransform, warning=FALSE, message=FALSE, fig.cap = "PCA plot of the tung data after sctransform normalisation (Pearson residuals)."} reducedDim(umi.qc, "PCA_sctrans_norm") <- reducedDim( runPCA(umi.qc[endog_genes, ], exprs_values = "sctrans_norm") ) @@ -504,7 +506,7 @@ plotReducedDim( ) + ggtitle("PCA plot: sctransform normalization") ``` -```{r norm-ours-rle-sctransform, fig.cap = "Cell-wise RLE of the tung data"} +```{r norm-ours-rle-sctransform, warning=FALSE, message=FALSE, fig.cap = "Cell-wise RLE of the tung data"} plotRLE( umi.qc[endog_genes, ], exprs_values = "sctrans_norm", @@ -541,7 +543,7 @@ instead of transcripts. 
`scater` uses the [biomaRt](https://bioconductor.org/packages/release/bioc/html/biomaRt.html) package, which allows one to annotate genes by other attributes: -```{r, message = FALSE, warning = FALSE} +```{r, message = FALSE, warning = FALSE, eval=FALSE} umi.qc <- getBMFeatureAnnos( umi.qc, filters = "ensembl_gene_id", @@ -579,19 +581,19 @@ umi.qc <- getBMFeatureAnnos( Some of the genes were not annotated, therefore we filter them out: -```{r} +```{r, eval=FALSE} umi.qc.ann <- umi.qc[!is.na(rowData(umi.qc)$ensembl_gene_id), ] ``` Now we compute the total gene length in Kilobases by using the `end_position` and `start_position` fields: -```{r} +```{r, eval=FALSE} eff_length <- abs(rowData(umi.qc.ann)$end_position - rowData(umi.qc.ann)$start_position) / 1000 ``` -```{r length-vs-mean, fig.cap = "Gene length vs Mean Expression for the raw data"} +```{r length-vs-mean, eval=FALSE, fig.cap = "Gene length vs Mean Expression for the raw data"} plot(eff_length, rowMeans(counts(umi.qc.ann))) ``` @@ -608,12 +610,12 @@ page](https://www.biostars.org/p/83901/). 
Now we are ready to perform the normalisations: -```{r} +```{r, eval=FALSE} tpm(umi.qc.ann) <- log2(calculateTPM(umi.qc.ann, eff_length) + 1) ``` Plot the results as a PCA plot: -```{r norm-pca-fpkm, fig.cap = "PCA plot of the tung data after TPM normalisation"} +```{r norm-pca-fpkm, eval=FALSE, fig.cap = "PCA plot of the tung data after TPM normalisation"} tmp <- runPCA( umi.qc.ann, exprs_values = "tpm", @@ -626,11 +628,11 @@ plotPCA( ) ``` -```{r} +```{r, eval=FALSE} tpm(umi.qc.ann) <- log2(calculateFPKM(umi.qc.ann, eff_length) + 1) ``` -```{r norm-pca-tpm, fig.cap = "PCA plot of the tung data after FPKM normalisation"} +```{r norm-pca-tpm, eval=FALSE, fig.cap = "PCA plot of the tung data after FPKM normalisation"} tmp <- runPCA( umi.qc.ann, exprs_values = "tpm", diff --git a/course_files/exprs-qc.Rmd b/course_files/exprs-qc.Rmd index b65355f0ac0188f6f2ea383719b387b414a79dca..de859afae2c457558e451439cd79786b5e5b353a 100644 --- a/course_files/exprs-qc.Rmd +++ b/course_files/exprs-qc.Rmd @@ -346,7 +346,7 @@ We demonstrate the usage of two of these doublet detection tools. ### scds -`scds`[@Bais2019-hf] has two detection methods: +`scds`[@Bais2019-wv] has two detection methods: 1) co-expression based; 2) binary-classification based. 
diff --git a/course_files/feature-selection.Rmd b/course_files/feature-selection.Rmd index 965e18f4c6f0a6e40670d5859afbd048bcdd5ebc..6d3e6be51ed07fe91a017a9a777b59757eb77e54 100644 --- a/course_files/feature-selection.Rmd +++ b/course_files/feature-selection.Rmd @@ -4,7 +4,7 @@ output: html_document --- ```{r setup, echo=FALSE} -knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE) +knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE, warning=FALSE, message=FALSE) knitr::opts_knit$set(root.dir = normalizePath("..")) ``` @@ -16,6 +16,9 @@ library(matrixStats) library(M3Drop) library(RColorBrewer) library(SingleCellExperiment) +library(Polychrome) +library(scater) +library(scran) set.seed(1) ``` @@ -47,7 +50,8 @@ For this section we will continue working with the Deng data. ```{r} deng <- readRDS("data/deng/deng-reads.rds") celltype_labs <- colData(deng)$cell_type2 -cell_colors <- brewer.pal(max(3,length(unique(celltype_labs))), "Set3") +cell_colors <- createPalette(10, c("#010101", "#ff0000"), M=1000) +names(cell_colors) <- unique(as.character(celltype_labs)) ``` Feature selection is performed after QC, however this data has already been QCed so @@ -87,12 +91,13 @@ first is to identify genes which behave differently from a null model describing just the technical noise expected in the dataset. If the dataset contains spike-in RNAs they can be used to directly model -technical noise. However, measurements of spike-ins may not experience -the same technical noise as endogenous transcripts [(Svensson et al., 2017)](https://www.nature.com/nmeth/journal/v14/n4/full/nmeth.4220.html). -In addition, scRNASeq experiments often contain only a small number of -spike-ins which reduces our confidence in fitted model parameters. +technical noise. However, measurements of spike-ins may not experience the same +technical noise as endogenous transcripts [(Svensson et al., +2017)](https://www.nature.com/nmeth/journal/v14/n4/full/nmeth.4220.html). 
In +addition, scRNASeq experiments often contain only a small number of spike-ins +which reduces our confidence in fitted model parameters. -#### Highly Variable Genes +#### Highly Variable Genes - Brennecke method The first method proposed to identify features in scRNASeq datasets was to identify highly variable genes (HVG). HVG assumes that if genes @@ -117,20 +122,22 @@ plot( main="" ) ``` -A popular method to correct for the relationship between variance and mean expression -was proposed by [Brennecke et al.](http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2645.html). -To use the Brennecke method, we first normalize for library size then calculate -the mean and the square coefficient of variation (variation divided by -the squared mean expression). A quadratic curve is fit to the relationship -between these two variables for the ERCC spike-in, and then a chi-square test is used to find genes -significantly above the curve. This method is included in the M3Drop package as the -Brennecke_getVariableGenes(counts, spikes) function. However, this dataset does not contain spike-ins -so we will use the entire dataset to estimate the technical noise. - -In the figure below the red curve -is the fitted technical noise model and the dashed line is the 95% -CI. Pink dots are the genes with significant biological variability -after multiple-testing correction. + +An early method to correct for the relationship between variance and mean +expression was proposed by [Brennecke et +al.](http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2645.html). To use +the Brennecke method, we first normalize for library size then calculate the +mean and the square coefficient of variation (variation divided by the squared +mean expression). A quadratic curve is fit to the relationship between these two +variables for the ERCC spike-in, and then a chi-square test is used to find +genes significantly above the curve. 
This method is included in the M3Drop +package as the Brennecke_getVariableGenes(counts, spikes) function. However, +this dataset does not contain spike-ins so we will use the entire dataset to +estimate the technical noise. + +In the figure below the red curve is the fitted technical noise model and the +dashed line is the 95% CI. Pink dots are the genes with significant biological +variability after multiple-testing correction. ```{r, fig.width = 7, fig.height = 6} Brennecke_HVG <- BrenneckeGetVariableGenes( @@ -140,9 +147,10 @@ Brennecke_HVG <- BrenneckeGetVariableGenes( ) ``` -This function returns a matrix of significant genes as well as their estimated effect size (difference -between observed and expected coefficient of variation), and their significance as raw p.values and -FDR corrected q.values. For now we will just keep the names of the significant HVG genes. +This function returns a matrix of significant genes as well as their estimated +effect size (difference between observed and expected coefficient of variation), +and their significance as raw p.values and FDR corrected q.values. For now we +will just keep the names of the significant HVG genes. ```{r} HVG_genes <- Brennecke_HVG$Gene @@ -154,23 +162,73 @@ How many genes were signifcant using BrenneckeGetVariableGenes? ```{r, echo=FALSE, fig.width = 8.5, fig.height = 6} length(HVG_genes) ``` + +#### Highly Variable Genes - simpleSingleCell method + +The Bioconductor +[simpleSingleCell](https://bioconductor.org/packages/release/workflows/html/simpleSingleCell.html) +workflow has a great deal of excellent material to help your analyses. Here, we +show how to identify highly variable genes using functionality from the `scran` +package. + +This method assumes that technical variance is captured by a Poisson +distribution, and that variance beyond that explained by a Poisson distribution +represents biological variance of interest. 
This approach separates the +biological component of the variance from the technical component and thus can +rank genes based on their "biological" variance. This model also provides +p-values (with FDR adjustment) that can be used to identify the set of +"significant" highly variable genes at a given significance level. + +```{r hvg-simpleSingleCell} +### make a technical trend of variance based on Poisson +var.fit <- trendVar(deng, parametric=TRUE, loess.args=list(span=0.4), use.spikes = FALSE) +var.out <- decomposeVar(deng, var.fit) +plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression", + ylab="Variance of log-expression") +points(var.out$mean[isSpike(deng)], var.out$total[isSpike(deng)], col="red", pch=16) +curve(var.fit$trend(x), col="dodgerblue", add=TRUE, lwd=2) + +chosen.genes <- order(var.out$bio, decreasing=TRUE)[1:10] +plotExpression(deng, rownames(var.out)[chosen.genes], + point_alpha=0.5, jitter_type="jitter") + +top.dec <- var.out[order(var.out$bio, decreasing=TRUE),] + # the highly variable genes with largest biological components +head(top.dec) +simplesinglecell_genes <- rownames(top.dec)[top.dec$FDR < 0.001] +table(top.dec$FDR < 0.001) +``` + +If we set an FDR threshold of 0.1%, this approach identifies around 1300 highly +variable genes. + +The output of this variance modelling can be used as input to a `denoisePCA()` +function to compute "denoised" principal components for clustering and other +downstream analyses (details not shown here; please see the `simpleSingleCell` +workflow). + + #### High Dropout Genes -An alternative to finding HVGs is to identify genes with unexpectedly high numbers of zeros. -The frequency of zeros, known as the "dropout rate", is very closely related to expression level -in scRNASeq data. Zeros are the dominant feature of single-cell RNASeq data, typically accounting -for over half of the entries in the final expression matrix. 
These zeros predominantly result -from the failure of mRNAs failing to be reversed transcribed [(Andrews and Hemberg, 2016)](http://www.biorxiv.org/content/early/2017/05/25/065094). Reverse transcription -is an enzyme reaction thus can be modelled using the Michaelis-Menten equation: +An alternative to finding HVGs is to identify genes with unexpectedly high +numbers of zeros. The frequency of zeros, known as the "dropout rate", is very +closely related to expression level in scRNASeq data. Zeros are the dominant +feature of single-cell RNASeq data, typically accounting for over half of the +entries in the final expression matrix. These zeros predominantly result from +the failure of mRNAs to be reverse transcribed [(Andrews and Hemberg, +2016)](http://www.biorxiv.org/content/early/2017/05/25/065094). Reverse +transcription is an enzyme reaction thus can be modelled using the +Michaelis-Menten equation: $$P_{dropout} = 1 - S/(K + S)$$ -where $S$ is the mRNA concentration in the cell (we will estimate this as average expression) -and $K$ is the Michaelis-Menten constant. +where $S$ is the mRNA concentration in the cell (we will estimate this as +average expression) and $K$ is the Michaelis-Menten constant. -Because the Michaelis-Menten equation is a convex non-linear function, genes which are -differentially expression across two or more populations of cells in our dataset will -be shifted up/right of the Michaelis-Menten model (see Figure below). +Because the Michaelis-Menten equation is a convex non-linear function, genes +which are differentially expressed across two or more populations of cells in +our dataset will be shifted up/right of the Michaelis-Menten model (see Figure +below). ```{r, fig.width = 8.5, fig.height = 6, echo=TRUE} K <- 49 @@ -205,9 +263,12 @@ points( cex = 3 ) ``` -__Note__: add `log="x"` to the `plot` call above to see how this looks on the log scale, which is used in M3Drop figures. 
-__Exercise 4__: Produce the same plot as above with different expression levels (S1 & S2) and/or mixtures (mix). +__Note__: add `log="x"` to the `plot` call above to see how this looks on the +log scale, which is used in M3Drop figures. + +__Exercise 4__: Produce the same plot as above with different expression levels +(S1 & S2) and/or mixtures (mix). ```{r, include=FALSE} plot( @@ -287,20 +348,100 @@ How many genes were signifcant using NBumiFeatureSelectionCombinedDrop? nrow(DropFS) ``` +#### Residual variance from a (regularized) negative binomial model + +In the [normalization chapter](#normalization-theory) we introduced the +`sctransform` approach to using Pearson residuals from a regularized negative +binomial generalized linear model to normalize scRNA-seq data. + +The residual variance of genes (i.e. the variance of the Pearson residuals) +provides a way to identify highly variable genes, where the "variance" is +decoupled from the average level of expression of the gene. + +The residual variance is easily accessible from the `sctransform` output as we +show below. + +First, we run `sctransform` as we did previously. 
+ +```{r sctransform-apply, warning=FALSE, message=FALSE, results='hide'} +deng_sparse <- as(counts(deng), "dgCMatrix") +### Genes expressed in at least 5 cells will be kept +sctnorm_data <- sctransform::vst(umi = deng_sparse, min_cells = 1, + cell_attr = as.data.frame(colData(deng)), + latent_var = "log10_total_counts_endogenous") +``` + + +```{r} +sctnorm_data$model_str +``` + + +```{r sctransform-feature-select, warning=FALSE, message=FALSE} +library(ggplot2) +ggplot(sctnorm_data$gene_attr, aes(residual_variance)) + + geom_histogram(binwidth=0.1) + + geom_vline(xintercept=1, color='red') + xlim(0, 10) + + theme_bw() + +sctnorm_data$gene_attr$label <- rownames(sctnorm_data$gene_attr) +ggplot(sctnorm_data$gene_attr, aes(x = gmean, y=residual_variance)) + + geom_point(alpha = 0.6) + + geom_point(colour = "firebrick2", + data = sctnorm_data$gene_attr[sctnorm_data$gene_attr$residual_variance > 3,]) + + scale_x_log10() + + geom_hline(yintercept = 1, size = 3, color = "dodgerblue") + + geom_label(aes(label = label), + data = sctnorm_data$gene_attr[sctnorm_data$gene_attr$residual_variance > 30,]) + + theme_bw() + +sct_genes <- rownames(sctnorm_data$gene_attr)[sctnorm_data$gene_attr$residual_variance > 4] +table(sctnorm_data$gene_attr$residual_variance > 4) +``` + +If we set a (relatively arbitrary) threshold of a residual variance greater than +four marking a "highly variable gene", then we identify around 2000 highly +variable genes with this `sctransform` approach. + + +[NB: the `deng` data is extremely high depth for scRNA-seq data, so not the most +applicable dataset for `sctransform`, but we include this analysis here to +demonstrate the method rather than make any evaluation of its performance in +general.] + + +Although not explored here, the _deviance_ statistic from the regularized NB GLM +fit provides a natural way to select informative features for downstream +analyses. 
+ +The [deviance](https://en.wikipedia.org/wiki/Deviance_(statistics)) is a +goodness-of-fit statistic for a statistical model. As Wikipedia notes, deviance +is a generalization of the idea of using the sum of squares of residuals in +ordinary least squares to cases where model-fitting is achieved by maximum +likelihood. It plays an important role in exponential dispersion models and +generalized linear models, such as the negative binomial model. + +However, `sctransform` does not seem set up to use the model deviance to select +informative features, but we expect this could be a direction the field goes in +the near future. Keep an eye out! + + ### Correlated Expression -A completely different approach to feature selection is to use gene-gene correlations. This method -is based on the idea that multiple genes will be differentially expressed between different cell-types -or cell-states. Genes which are expressed in the same cell-population will be positively correlated -with each other where as genes expressed in different cell-populations will be negatively correated with -each other. Thus important genes can be identified by the magnitude of their correlation -with other genes. +A completely different approach to feature selection is to use gene-gene +correlations. This method is based on the idea that multiple genes will be +differentially expressed between different cell-types or cell-states. Genes +which are expressed in the same cell-population will be positively correlated +with each other whereas genes expressed in different cell-populations will be +negatively correlated with each other. Thus important genes can be identified by +the magnitude of their correlation with other genes. 
-The limitation of this method is that it assumes technical noise is random and independent for each cell, -thus shouldn't produce gene-gene correlations, but this assumption is violated by batch effects which are -generally systematic between different experimental batches and will produce gene-gene correlations. As a -result it is more appropriate to take the top few thousand genes as ranked by gene-gene correlation than -consider the significance of the correlations. +The limitation of this method is that it assumes technical noise is random and +independent for each cell, thus shouldn't produce gene-gene correlations, but +this assumption is violated by batch effects which are generally systematic +between different experimental batches and will produce gene-gene correlations. +As a result it is more appropriate to take the top few thousand genes as ranked +by gene-gene correlation than consider the significance of the correlations. ```{r, eval=FALSE} @@ -308,55 +449,11 @@ cor_feat <- M3Drop::corFS(expr_matrix) Cor_genes <- names(cor_feat)[1:1500] ``` -Lastly, another common method for feature selection in scRNASeq data is to use PCA loadings. Genes with -high PCA loadings are likely to be highly variable and correlated with many other variable genes, thus -may be relevant to the underlying biology. However, as with gene-gene correlations PCA loadings tend to -be susceptible to detecting systematic variation due to batch effects; thus it is recommended to plot the PCA -results to determine those components corresponding to the biological variation rather than batch effects. 
- -```{r, fig.width=7, fig.height=6} -# PCA is typically performed on log-transformed expression data -pca <- prcomp(log(expr_matrix + 1) / log(2)) -# plot projection -plot( - pca$rotation[,1], - pca$rotation[,2], - pch = 16, - col = cell_colors[as.factor(celltype_labs)] -) -# calculate loadings for components 1 and 2 -score <- rowSums(abs(pca$x[,c(1,2)])) -names(score) <- rownames(expr_matrix) -score <- score[order(-score)] -PCA_genes <- names(score[1:1500]) -``` -__Exercise 6__ -Consider the top 5 principal components. Which appear to be most biologically relevant? How does the top 1,500 -features change if you consider the loadings for those components? -```{r, include=FALSE} -plot( - pca$rotation[,2], - pca$rotation[,3], - pch = 16, - col = cell_colors[as.factor(celltype_labs)] -) -plot( - pca$rotation[,3], - pca$rotation[,4], - pch = 16, - col = cell_colors[as.factor(celltype_labs)] -) -# calculate loadings for components 1 and 2 -score <- rowSums(abs(pca$x[,c(2, 3, 4)])) -names(score) <- rownames(expr_matrix) -score <- score[order(-score)] -PCA_genes2 = names(score[1:1500]) -``` ### Comparing Methods -We can check whether the identified features really do represent genes differentially expressed between -cell-types in this dataset. +We can check whether the identified features really do represent genes +differentially expressed between cell-types in this dataset. ```{r, fig.width = 7, fig.height = 10} M3DropExpressionHeatmap( @@ -366,14 +463,27 @@ M3DropExpressionHeatmap( ) ``` -We can also consider how consistent each feature selection method is with the others using the Jaccard Index: +We can also consider how consistent each feature selection method is with the +others using the Jaccard Index: + ```{r} J <- sum(M3Drop_genes %in% HVG_genes)/length(unique(c(M3Drop_genes, HVG_genes))) ``` -__Exercise 7__ +__Exercise 6__ + +Plot the expression of the features for each of the other methods. Which appear +to be differentially expressed? 
How consistent are the different methods for +this dataset? + +```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} +M3DropExpressionHeatmap( + DANB_genes, + expr_matrix, + cell_labels = celltype_labs +) +``` -Plot the expression of the features for each of the other methods. Which appear to be differentially expressed? How consistent are the different methods for this dataset? ```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} M3DropExpressionHeatmap( @@ -393,7 +503,7 @@ M3DropExpressionHeatmap( ```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} M3DropExpressionHeatmap( - PCA_genes, + simplesinglecell_genes, expr_matrix, cell_labels = celltype_labs ) @@ -401,19 +511,21 @@ M3DropExpressionHeatmap( ```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} M3DropExpressionHeatmap( - PCA_genes2, + sct_genes, expr_matrix, cell_labels = celltype_labs ) ``` -```{r, eval=FALSE, include=FALSE} +Jaccard index comparison of sets of informative features: + +```{r, eval=TRUE, include=TRUE} list_of_features <- list( - M3Drop_genes, - HVG_genes, - Cor_genes, - PCA_genes, - PCA_genes2 + M3Drop_genes, + DANB_genes, + HVG_genes, + simplesinglecell_genes, + sct_genes ) Out <- matrix( 0, @@ -426,7 +538,8 @@ for(i in 1:length(list_of_features) ) { length(unique(c(list_of_features[[i]], list_of_features[[j]]))) } } -colnames(Out) <- rownames(Out) <- c("M3Drop", "HVG", "Cor", "PCA", "PCA2") +colnames(Out) <- rownames(Out) <- c("M3Drop", "DANB", "Brennecke", "simpleSingleCell", "sctransform") +Out ``` diff --git a/course_files/figures/FA.png b/course_files/figures/FA.png index ebbfebffbc374e73c134c2d70734b7a3ddb4970b..5e17a7274d3236635e3d34e780dce290b25c49a0 100644 Binary files a/course_files/figures/FA.png and b/course_files/figures/FA.png differ diff --git a/course_files/figures/FA_matrix.png b/course_files/figures/FA_matrix.png new file mode 100644 index 
0000000000000000000000000000000000000000..c2d2b68fcf2024990556f5cf864772bbef309eea Binary files /dev/null and b/course_files/figures/FA_matrix.png differ diff --git a/course_files/figures/rotation2.png b/course_files/figures/rotation2.png index 655a7b3b16da0649ea51250383e02d2216b1f8cf..df72366f4f91ddaa2ecedf2cee587391fe0a2f2d 100644 Binary files a/course_files/figures/rotation2.png and b/course_files/figures/rotation2.png differ diff --git a/course_files/figures/slab_spike.png b/course_files/figures/slab_spike.png new file mode 100644 index 0000000000000000000000000000000000000000..d88d5916719cbcf546e3a1828f7f22513ede3d22 Binary files /dev/null and b/course_files/figures/slab_spike.png differ diff --git a/course_files/figures/slalom_anno.png b/course_files/figures/slalom_anno.png new file mode 100644 index 0000000000000000000000000000000000000000..77a37b361609bab3e0a76603f7c9f61611f4d34d Binary files /dev/null and b/course_files/figures/slalom_anno.png differ diff --git a/course_files/figures/soneson-de-benchmark-fig5.png b/course_files/figures/soneson-de-benchmark-fig5.png new file mode 100644 index 0000000000000000000000000000000000000000..dab02c3998c0c9aff91c3ec70484c6e14bc5f9c9 Binary files /dev/null and b/course_files/figures/soneson-de-benchmark-fig5.png differ diff --git a/course_files/handling-sparsity.Rmd b/course_files/handling-sparsity.Rmd index 19e56d0c3ff266094ded763f8cef674fe482244f..38dfcccf91f533d947380b80f306799a9b29296f 100644 --- a/course_files/handling-sparsity.Rmd +++ b/course_files/handling-sparsity.Rmd @@ -21,8 +21,8 @@ These observed zero values can represent either missing data (i.e.~a gene is exp The term ``dropout'' is often used to denote observed zero values in scRNA-seq data, but this term conflates zero values attributable to methodological noise and biologically-true zero expression, so we recommend against its use as a catch-all term for observed zeros. 
Sparsity in scRNA-seq data can hinder downstream analyses, but it is challenging to model or handle it appropriately, and thus, there remains an ongoing need for improved methods. -Sparsity pervades all aspects of scRNA-seq data analysis, but here we focus on the linked problems of learning latent spaces and ``imputing'' expression values from scRNA-seq data (\autoref{fig:denoising-imputation}). -Imputation, ``data smoothing'' and ``data reconstruction'' approaches are closely linked to the challenges of normalization. +Sparsity pervades all aspects of scRNA-seq data analysis, but here we focus on the linked problems of learning latent spaces and "imputing" expression values from scRNA-seq data. +Imputation, "data smoothing" and "data reconstruction" approaches are closely linked to the challenges of normalization. But whereas normalization generally aims to make expression values between cells more comparable to each other, imputation and data smoothing approaches aim to achieve adjusted data values that---it is hoped---better represent the true expression values. Imputation methods could therefore be used for normalization, but do not entail all possible or useful approaches to normalization. @@ -33,9 +33,9 @@ The imputation of missing values has been very successful for genotype data. Crucially, when imputing genotypes we often know which data are missing (e.g.~when no genotype call is possible due to no coverage of a locus, although see section \autoref{sec:dna-variation-calling} for the challenges with \ac{scdnaseq} data) and rich sources of external information are available (e.g.~haplotype reference panels). Thus, genotype imputation is now highly accurate and a commonly-used step in data processing for genetic association studies \citep{Das2018-zs}. -The situation is somewhat different for scRNA-seq data, as we do not routinely have external reference information to apply (see \autoref{sec:rna-ref-atlases}). 
-In addition, we can never be sure which observed zeros represent ``missing data'' and which accurately represent a true gene expression level in the cell \citep{hicks_missing_2018}. -Observed zeros can either represent ``biological'' zeros, i.e.~those present because the true expression level of a gene in a cell was zero. +The situation is somewhat different for scRNA-seq data, as we do not routinely have external reference information to apply. +In addition, we can never be sure which observed zeros represent "missing data" and which accurately represent a true gene expression level in the cell \citep{hicks_missing_2018}. +Observed zeros can either represent "biological" zeros, i.e.~those present because the true expression level of a gene in a cell was zero. Or they they are the result of methodological noise, which can arise when a gene has true non-zero expression in a cell, but no counts are observed due to failures at any point in the complicated process of processing mRNA transcripts in cells into mapped reads. Such noise can lead to artefactual zero that are either more systematic (e.g.~sequence-specific mRNA degradation during cell lysis) or that occur by chance (e.g.~barely expressed transcripts that at the same expression level will sometimes be detected and sometimes not, due to sampling variation, e.g~in the sequencing). The high degree of sparsity in scRNA-seq data therefore arises from technical zeros and true biological zeros, which are difficult to distinguish from one another. @@ -52,13 +52,13 @@ It is therefore desirable to improve both statistical methods that work on spars We define three broad (and sometimes overlapping) categories of methods that can be used to ``impute'' scRNA-seq data in the absence of an external reference: -1. 
__Model-based imputation methods of technical zeros_ use probabilistic models to identify which observed zeros represent technical rather than biological zeros and aim to impute expression levels just for these technical zeros, leaving other observed expression levels untouched; or -1. __Data-smoothing methods_ define sets of ``similar'' cells (e.g.~cells that are neighbors in a graph or occupy a small region in a latent space) and adjust expression values for each cell based on expression values in similar cells. +1. __Model-based imputation methods of technical zeros__ use probabilistic models to identify which observed zeros represent technical rather than biological zeros and aim to impute expression levels just for these technical zeros, leaving other observed expression levels untouched; or +1. __Data-smoothing methods__ define sets of "similar" cells (e.g.~cells that are neighbors in a graph or occupy a small region in a latent space) and adjust expression values for each cell based on expression values in similar cells. These methods adjust all expression values, including technical zeros, biological zeros and observed non-zero values. -1. __Data-reconstruction methods_ typically aim to define a latent space representation of the cells. +1. __Data-reconstruction methods__ typically aim to define a latent space representation of the cells. This is often done through matrix factorization (e.g.~principal component analysis) or, increasingly, through machine learning approaches (e.g.~variational autoencoders that exploit deep neural networks to capture non-linear relationships). -Although a broad class of methods, both matrix factorization methods and autoencoders (among others) are able to ``reconstruct'' the observed data matrix from low-rank or simplified representations. 
-The reconstructed data matrix will typically no longer be sparse (with many zeros) and the implicitly ``imputed'' data can be used for downstream applications that cannot handle sparse count data. +Although a broad class of methods, both matrix factorization methods and autoencoders (among others) are able to "reconstruct" the observed data matrix from low-rank or simplified representations. +The reconstructed data matrix will typically no longer be sparse (with many zeros) and the implicitly "imputed" data can be used for downstream applications that cannot handle sparse count data. The first category of methods generally seeks to infer a probabilistic model that captures the data generation mechanism. Such generative models can be used to identify, probabilistically, which observed zeros correspond to technical zeros (to be imputed) and which correspond to biological zeros (to be left alone). @@ -76,9 +76,9 @@ Clustering methods that implicitly impute values, such as CIDR \citep{lin_cidr:_ <!-- \label{fig:denoising-imputation} --> <!-- \end{figure*} --> -Data-smoothing methods, which adjust all gene expression levels based on expression levels in ``similar'' cells, have also been proposed to handle imputation problems. -We might regard these approaches as ``denoising'' methods. -To take a simplified example (\autoref{fig:denoising-imputation}), we might imagine that single cells originally refer to points in two-dimensional space, but are likely to describe a one-dimensional curve; projecting data points onto that curve eventually allows imputation of the ``missing'' values (but all points are adjusted, or smoothed, not just true technical zeros). +Data-smoothing methods, which adjust all gene expression levels based on expression levels in "similar" cells, have also been proposed to handle imputation problems. +We might regard these approaches as "denoising" methods. 
+To take a simplified example, we might imagine that single cells originally refer to points in two-dimensional space, but are likely to describe a one-dimensional curve; projecting data points onto that curve eventually allows imputation of the "missing" values (but all points are adjusted, or smoothed, not just true technical zeros). Prominent data-smoothing approaches to handling sparse counts include: - diffusion-based MAGIC \citep{dijk_recovering_2018} @@ -89,9 +89,9 @@ Prominent data-smoothing approaches to handling sparse counts include: A major task in the analysis of high-dimensional single-cell data is to find low-dimensional representations of the data that capture the salient biological signals and render the data more interpretable and amenable to further analyses. -As it happens, the matrix factorization and latent-space learning methods used for that task also provide another route for imputation through their ability to \emph{reconstruct} the observed data matrix from simplified representations of it. -\Ac{pca} is one such standard matrix factorization method that can be applied to scRNA-seq data (preferably after suitable data normalization) as are other widely-used general statistical methods like \ac{ica} and \ac{nmf}. -As (linear) matrix factorization methods, \ac{pca}, \ac{ica} and \ac{nmf} decompose the observed data matrix into a ``small'' number of factors in two low-rank matrices, one representing cell-by-factor weights and one gene-by-factor loadings. +As it happens, the matrix factorization and latent-space learning methods used for that task also provide another route for imputation through their ability to _reconstruct_ the observed data matrix from simplified representations of it. +PCA is one such standard matrix factorization method that can be applied to scRNA-seq data (preferably after suitable data normalization) as are other widely-used general statistical methods like ICA and NMF. 
+As (linear) matrix factorization methods, PCA, ICA and NMF decompose the observed data matrix into a "small" number of factors in two low-rank matrices, one representing cell-by-factor weights and one gene-by-factor loadings. Many matrix factorization methods with tweaks for single-cell data have been proposed in recent years, including: - ZIFA, a zero-inflated factor analysis \citep{pierson_zifa:_2015} diff --git a/course_files/imputation.Rmd b/course_files/imputation.Rmd index 739c9c9d4233c3d2f2e184f87a558264a06fd89d..c6e1200caf09fd80e4c365b8a7997b3a22486f2e 100644 --- a/course_files/imputation.Rmd +++ b/course_files/imputation.Rmd @@ -3,7 +3,7 @@ output: html_document --- ```{r setup, echo=FALSE} -knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE) +knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=FALSE) knitr::opts_knit$set(root.dir = normalizePath("..")) ``` diff --git a/course_files/index.Rmd b/course_files/index.Rmd index 233f926027da1ed73501e7647538034dd841396b..7d0aab75df259a70c4274f0000a96966a315ee87 100644 --- a/course_files/index.Rmd +++ b/course_files/index.Rmd @@ -1,6 +1,6 @@ --- title: "Analysis of single cell RNA-seq data" -author: "Davis McCarthy (<a href = 'https://twitter.com/davisjmcc'>davisjmcc</a>), Ruqian Lyu, PuXue Qiao, Vladimir Kiselev (<a href = 'https://twitter.com/wikiselev'>wikiselev</a>), Tallulah Andrews (<a href = 'https://twitter.com/talandrews'>talandrews</a>), Jennifer Westoby (<a href = 'https://twitter.com/Jenni_Westoby'>Jenni_Westoby</a>), Maren Büttner (<a href = 'https://twitter.com/marenbuettner'>marenbuettner</a>), Jimmy Lee (<a href = 'https://twitter.com/THJimmyLee'>THJimmyLee</a>), Krzysztof Polanski, Sebastian Y. Müller, Elo Madissoon, Stephane Ballereau, Maria Do Nascimento Lopes Primo, Rocio Martinez Nunez and Martin Hemberg (<a href = 'https://twitter.com/m_hemberg'>m_hemberg</a>)" +author: "Ruqian Lyu, PuXue Qiao, and Davis J. 
McCarthy (<a href = 'https://twitter.com/davisjmcc'>davisjmcc</a>)" date: "`r Sys.Date()`" #knit: "bookdown::render_book" documentclass: book @@ -10,6 +10,18 @@ link-citations: yes always_allow_html: yes --- +This version of the course builds on the May 2019 version of the course authored +by: Vladimir Kiselev (<a href = 'https://twitter.com/wikiselev'>wikiselev</a>), +Tallulah Andrews (<a href = 'https://twitter.com/talandrews'>talandrews</a>), +Davis J. McCarthy (<a href = 'https://twitter.com/davisjmcc'>davisjmcc</a>), +Jennifer Westoby (<a href = +'https://twitter.com/Jenni_Westoby'>Jenni_Westoby</a>), Maren Büttner (<a href = +'https://twitter.com/marenbuettner'>marenbuettner</a>), Jimmy Lee (<a href = +'https://twitter.com/THJimmyLee'>THJimmyLee</a>), Krzysztof Polanski, Sebastian +Y. Müller, Elo Madissoon, Stephane Ballereau, Maria Do Nascimento Lopes Primo, +Rocio Martinez Nunez and Martin Hemberg (<a href = +'https://twitter.com/m_hemberg'>m_hemberg</a>) + # About the course <!-- > > <span style="color:red">__Important!__ The course will be run on the __2nd - 3rd October 2019, both days 9:00-17:00 Melbourne, Australia time__. </span> --> diff --git a/course_files/intro.Rmd b/course_files/intro.Rmd index 29d19cea9fbe18403c552c4ea44b76a70d1a523e..5fa4b14d572182656bc5ff83bee018d719416fc3 100644 --- a/course_files/intro.Rmd +++ b/course_files/intro.Rmd @@ -61,10 +61,32 @@ Today, there are also several different platforms available for carrying out one ## Challenges -The main difference between bulk and single cell RNA-seq is that each sequencing library represents a single cell, instead of a population of cells. Therefore, significant attention has to be paid to comparison of the results from different cells (sequencing libraries). The main sources of discrepancy between the libraries are: +The main difference between bulk and single cell RNA-seq is that each sequencing +library represents a single cell, instead of a population of cells. 
Therefore, +significant attention has to be paid to comparison of the results from different +cells (sequencing libraries). The main sources of discrepancy between the +libraries are: +* __Reverse transcription__ to convert RNA to cDNA is at best <30% efficient * __Amplification__ (up to 1 million fold) -* __Gene 'dropouts'__ in which a gene is observed at a moderate expression level in one cell but is not detected in another cell [@Kharchenko2014-ts]. - -In both cases the discrepancies are introduced due to low starting amounts of transcripts since the RNA comes from one cell only. Improving the transcript capture efficiency and reducing the amplification bias are currently active areas of research. However, as we shall see in this course, it is possible to alleviate some of these issues through proper normalization and corrections. +* __Gene 'dropouts'__ in which a gene is observed at a moderate expression level in one cell but is not detected in another cell [@Kharchenko2014-ts]; this can be due to technical factors (e.g. inefficient RT) or true biological variability across cells. + +These discrepancies are introduced due to low starting amounts of transcripts +since the RNA comes from one cell only. Improving the transcript capture +efficiency and reducing the amplification bias are currently active areas of +research. However, as we shall see in this course, it is possible to alleviate +some of these issues through proper normalization and corrections and effective +statistical models. + +For the analyst, the characteristics of single-cell RNA-seq data lead to +challenges in handling: + +* __Sparsity__ +* __Variability__ +* __Scalability__ +* __Complexity__ + +In this workshop we will present computational approaches that can allow us to +face these challenges as we try to answer biological questions of interest from +single-cell transcriptomic data. 
diff --git a/course_files/latent-spaces.Rmd b/course_files/latent-spaces.Rmd index c3861628c837088b9ec0d0f46d887abd5fa0b7cd..fb5c13a9b76e8f00720d5709f814c30630ac5237 100644 --- a/course_files/latent-spaces.Rmd +++ b/course_files/latent-spaces.Rmd @@ -3,10 +3,15 @@ output: html_document --- ```{r setup, echo=FALSE} -knitr::opts_chunk$set(fig.align = "center") +knitr::opts_chunk$set(fig.align = "center", eval = TRUE, warning=FALSE, message=FALSE) knitr::opts_knit$set(root.dir = normalizePath("..")) ``` +# Latent spaces + +In many cases we may like to think of cells sitting in a low-dimensional, +"latent" space that captures relationships between cells more intuitively than +the very high-dimensional gene expression space. ```{r library, echo=TRUE} library(scater) @@ -17,11 +22,6 @@ library(Polychrome) library(slalom) ``` -# Latent spaces - -In many cases we may like to think of cells sitting in a low-dimensional, -"latent" space that captures relationships between cells more intuitively than -the very high-dimensional gene expression space. ## Dimensionality reduction @@ -107,7 +107,9 @@ plotPCA(deng, colour_by = "cell_type2") + non-linear dependencies. For instance, PCA would not be able to “unroll†the following structure.\ <center> {width=30%} </center> -#### GLM-PCA +#### [GLM-PCA](https://rdrr.io/cran/glmpca/) +[@collins2002generalization] +[@townes2019feature] GLM-PCA is a generalized version of the traditional PCA. @@ -217,8 +219,7 @@ ggplot(pd, aes(x=dim1, y=dim2, shape=clust, colour=batch)) + ### tSNE: t-Distributed Stochastic Neighbor Embedding -t-SNE is an advanced version of the original SNE algorithm. <font color="red"> -[ref] </font> +t-SNE [@maaten2008visualizing] is an advanced version of the original SNE algorithm. 
[@hinton2003stochastic]
 
 #### Motivation
 
@@ -321,7 +322,7 @@ Therefore can merely be used for visualization.\
 
 ### Manifold methods
 
-#### UMAP: Uniform Manifold Approximation and Projection
+#### UMAP: Uniform Manifold Approximation and Projection [@mcinnes2018umap]
 
 ##### __Advantages of UMAP over t-SNE:__
 
@@ -368,7 +369,7 @@ plotUMAP(muraro, colour_by="cell_type1")
 
 
 
-#### PHATE
+#### PHATE [@moon2017phate]
 
 ##### Sketch of algorithm
 
@@ -433,22 +434,66 @@ ggplot(dt, aes(x=PHATE1, y=PHATE2, color=clust)) +
 
 ## Matrix factorization and factor analysis
 
-Factor Analysis is similar to PCA in that,
-they both aim to obtain a new set of distinct summary variables,
-which are fewer in number than the original number of variables.
-
-The key concept of factor analysis is that the original, observed variables are
+__The key concept of factor analysis__: The original, observed variables are
 correlated because they are all associated with some unobservable variables,
-called latent factors.
+the __latent factors__.
 
-The variance of a variable can be splitted into two parts: \
+It looks similar to PCA, but instead of dimensionality reduction, factor analysis
+focuses on studying the latent factors.
+
+The variance of an observed variable can be split into two parts: \
 - Common variance: the part of variance that is explained by latent factors; \
-- Unique variance: the part that is specific to only one variable, usually considered as an error component or residual.
+- Unique variance: the part that is specific to only one variable, usually considered as an error component or __residual__.
+
+The __factor loadings__ or weights indicate how much each latent factor is affecting the observed features. 
-<center> {width=80%} </center> +<center> {width=60%} </center> ### [Slalom](https://bioconductor.org/packages/release/bioc/html/slalom.html): Interpretable latent spaces +Highlight of Slalom: [@buettner2017f] + +- It incorporates prior information to help the model estimation; + +- It learns whatever not provided by prior knowledge in the model training process; + +- It enforces sparsity in the weight matrix. + +#### Methodology + +__Matrix expression of factor analysis:__ + +<center>{width=80%} </center> + +__How prior knowledge affects the model:__ + +<center> </center> + +- $I_{g, k}$: (observed) Indicator of whether a gene $g$ is annotated to a given pathway or factor $k$;\ +- $z_{g, k}$: (latent) Indicator of whether factor $k$ has a regulatory effect on gene $g$;\ +- $w_{g, k}$: (estimated) weights. + +__grey arrow__: +$$ P(I_{g, k}\vert z_{g, k}) = \begin{cases} +\text{Bernoulli}(p_1), \text{if } z_{g, k} = 1\\ +\text{Bernoulli}(p_2), \text{if } z_{g, k} = 0\\ +\end{cases}$$ + +__green arrow__: +$$ P(w_{g, k}\vert z_{g, k}) = \begin{cases} +N(w_{g, k}, 1/\alpha), \text{ if } z_{g, k} = 1\\ +\delta_0(w_{g, k}), \text{ if } z_{g, k} = 0\\ +\end{cases}$$ + +<center></center> + +We only look at the part of the __likelihood__ that is relavant to this part: +$\prod_{g} \prod_{k}P(I_{g, k}, w_{g, k}, z_{g, k})$, \ +where $P(I_{g, k}, w_{g, k}, z_{g, k}) = P(I_{g, k}, w_{g, k}| z_{g, k})P(z_{g,k}) += P( I_{g, k}| z_{g, k})P( w_{g, k}| z_{g, k})P(z_{g,k})$. +Since we do not know anything about $z_{g,k}$, it is assumed as Bernoulli(1/2). + +#### Example First, get a geneset in a `GeneSetCollection` object. 
```{r} gmtfile <- system.file("extdata", "reactome_subset.gmt", package = "slalom") @@ -472,15 +517,16 @@ model_deng <- trainSlalom(model_deng, nIterations = 1000, seed = 100, tolerance View results:\ The `plotRelevance` function displays the most relevant terms (factors/pathways) ranked by relevance, showing gene set size and the number of genes gained/lost as active in the pathway as learnt by the model. -```{r, fig.width=10, fig.height=5} -plotRelevance(model_deng) +```{r, fig.width=14, fig.height=7} +plotRelevance(model_deng) + theme_classic(base_size = 8) ``` The `plotTerms` function shows the relevance of all terms in the model, enabling the identification of the most important pathways in the context of all that were included in the model. ```{r} plotTerms(model_deng) ``` -## Autoencoders +## Autoencoders +[@kingma2013auto] <center>{width=80%}</center> @@ -495,9 +541,9 @@ So we want the to find the parameters $\theta$ such that the probability to gene - __How do we define $Z$?__\ - -__The simpliest idea:__ $Z \sim N(0, 1)$. + - __The simplest idea:__ $Z \sim N(0, 1)$. It is not impossible, because "any distribution in d dimensions can be generated by taking a set of d variables that are normally distributed and mapping them through a sufficiently complicated function. "\ - -__A better idea:__ + -__A better idea:__ For most of $z$, $P(X|z; \theta)$ will be close to zero, meaning it contribute almost nothing to the estimate of $P(X)$. Thus, we want to sample only those values of $Z$ that are likely to produce $X$. Denote this distribution of $Z$ as $Q(Z|X)$ (it is infered and therefore depend on $X$).\ __Advantage:__ There will be a lot less possible values of $Z$ under $Q$ compared to random sampling, therefore, it will be easier to compute $E_{Z \sim Q} P(X|Z)$. 
diff --git a/course_files/pseudotime.Rmd b/course_files/pseudotime.Rmd index 6203df28f7859e012a13ad3623f69a960f316615..114945ae9e62072e407b8c5e098ca508d483551e 100644 --- a/course_files/pseudotime.Rmd +++ b/course_files/pseudotime.Rmd @@ -136,7 +136,7 @@ As the plot above shows, PC1 struggles to correctly order cells early and late i Can bespoke pseudotime methods do better than naive application of PCA? -### TSCAN +## TSCAN TSCAN [@tscam_rpkg] combines clustering with pseudotime analysis. First it clusters the cells using `mclust`, which is based on a mixture of normal distributions. Then it builds a minimum spanning tree to connect the clusters. The branch of this tree that connects the largest number of clusters is the main branch which is used to determine pseudotime. @@ -181,7 +181,7 @@ TSCAN gets the development trajectory the "wrong way around", in the sense that __Exercise 1__ Compare results for different numbers of clusters (`clusternum`). -### Slingshot +## Slingshot `Slingshot` [@Street2018-ac] is a single-cell lineage inference tool, it can work with datasets with multiple branches. Slingshot has two stages: 1) the inference of the global lineage structure using MST on clustered data points and 2) the inference of pseudotime variables for cells along each lineage by fitting simultaneous 'principal curves' across multiple lineages. @@ -261,7 +261,7 @@ _Comments_ Did you notice the ordering of clusters in the lineage prediced for ` After running slingshot, an interesting next step may be to find genes that change their expression over the course of development. We demonstrate one possible method for this type of analysis on the 100 most variable genes. We will regress each gene on the pseudotime variable we have generated, using a general additive model (GAM). This allows us to detect non-linear patterns in gene expression. 
-```{r gam_tm_deg,message=FASLE} +```{r gam_tm_deg, message=FALSE} library(gam) t <- deng_SCE$slingPseudotime_1 @@ -296,7 +296,7 @@ heatmap(heatdata, Colv = NA, We will regress each gene on the pseudotime variable we have generated, using a general additive model (GAM). This allows us to detect non-linear patterns in gene expression. -### Monocle +## Monocle The original `Monocle` [@Trapnell2014-os] method skips the clustering stage of TSCAN and directly builds a minimum spanning tree on a reduced dimension representation (using 'ICA') of the @@ -480,9 +480,9 @@ __Exercise 2__ Do you get a better resolution between the later time points by c __Exercise 3__ How does the ordering change if you only use the genes identified by M3Drop? -### Other methods +## Other methods -#### SLICER +### SLICER The SLICER[@Welch2016-jr] method is an algorithm for constructing trajectories that describe gene expression changes during a sequential biological @@ -584,7 +584,7 @@ the call to `conn_knn_graph`? __Exercise 5__ How does the ordering change if you use a different set of genes from those chosen by SLICER (e.g. the genes identified by M3Drop)? -#### Ouija +### Ouija Ouija (http://kieranrcampbell.github.io/ouija/) takes a different approach from the pseudotime estimation methods we have looked at so far. Earlier methods have all been "unsupervised", which is to say that apart from perhaps selecting informative genes we do not supply the method with any prior information about how we expect certain genes or the trajectory as a whole to behave. @@ -725,7 +725,7 @@ What conclusions can you draw from the gene regulation output from Ouija? If you have time, you might try the HMC inference method and see if that changes the Ouija results in any way. -### Comparison of the methods +## Comparison of the methods How do the trajectories inferred by TSCAN, Monocle, Diffusion Map, SLICER and Ouija compare? 
@@ -751,7 +751,7 @@ corrplot.mixed(cor(df_pseudotime, use = "na.or.complete"), We see here that Ouija, TSCAN and SLICER all give trajectories that are similar and strongly correlated with PC1. Diffusion Map is less strongly correlated with these methods, and Monocle gives very different results. -### Expression of genes through time +## Expression of genes through time Each package also enables the visualization of expression through pseudotime. Following individual genes is very helpful for identifying genes that play an important role in the differentiation process. We illustrate the procedure using the `Nanog` gene. @@ -807,9 +807,12 @@ plotExpression(deng_SCE, "Nanog", x = "pseudotime_ouija", show_smooth = TRUE) ``` -How many of these methods outperform the naive approach of using the first principal component to represent pseudotime for these data? +**Q:** How many of these methods outperform the naive approach of using the first +principal component to represent pseudotime for these data? -__Exercise 7__: Repeat the exercise using a subset of the genes, e.g. the set of highly variable genes that can be obtained using `Brennecke_getVariableGenes()` +__Exercise 7__: Repeat the exercise using a subset of the genes, e.g. the set of +highly variable genes that can be obtained using one of the methods discussed in +the Feature Selection chapter. 
### dynverse diff --git a/course_files/remove-conf.Rmd b/course_files/remove-conf.Rmd index 69eccade9530bbd936dafcfb63c7de393fb1f446..c4df6303206b75eabfdc367895c5b06894abe49c 100644 --- a/course_files/remove-conf.Rmd +++ b/course_files/remove-conf.Rmd @@ -3,11 +3,11 @@ output: html_document --- ```{r setup, echo=FALSE} -knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE) +knitr::opts_chunk$set(out.width='90%', fig.align = 'center', eval=TRUE, warning=FALSE, message=FALSE) knitr::opts_knit$set(root.dir = normalizePath("..")) ``` -## Dealing with confounders +## Batch effects ### Introduction @@ -125,7 +125,7 @@ residuals (this may be reasonable for normalized log-counts in many cases; but it may not be---debate continues in the literature), then we can apply `limma` to regress out (known) unwanted sources of variation as follows. -```{r limma-lm} +```{r limma-lm, eval=FALSE} ## fit a model just accounting for batch lm_design_batch <- model.matrix(~0 + batch, data = colData(umi.qc)) fit_lm_batch <- lmFit(logcounts(umi.qc), lm_design_batch) @@ -160,24 +160,24 @@ __Exercise 2__ Perform LM correction for each individual separately. Store the final corrected matrix in the `lm_batch_indi` slot. -```{r limma-lm-indi, echo=TRUE} +```{r limma-lm-indi, echo=TRUE, eval=FALSE} ## define cellular detection rate (cdr), i.e. 
proportion of genes expressed in each cell umi.qc$cdr <- umi.qc$total_features_by_counts_endogenous / nrow(umi.qc) ## fit a model just accounting for batch by individual lm_design_batch1 <- model.matrix(~batch + cdr, - data = coldata(umi.qc)[umi.qc$individual == "na19098",]) + data = colData(umi.qc)[umi.qc$individual == "na19098",]) fit_indi1 <- lmfit(logcounts(umi.qc)[, umi.qc$individual == "na19098"], lm_design_batch1) fit_indi1$coefficients[,1] <- 0 ## replace intercept with 0 to preserve reference batch resids_lm_batch1 <- residuals(fit_indi1, logcounts(umi.qc)[, umi.qc$individual == "na19098"]) lm_design_batch2 <- model.matrix(~batch + cdr, - data = coldata(umi.qc)[umi.qc$individual == "na19101",]) + data = colData(umi.qc)[umi.qc$individual == "na19101",]) fit_indi2 <- lmfit(logcounts(umi.qc)[, umi.qc$individual == "na19101"], lm_design_batch2) fit_indi2$coefficients[,1] <- 0 ## replace intercept with 0 to preserve reference batch resids_lm_batch2 <- residuals(fit_indi2, logcounts(umi.qc)[, umi.qc$individual == "na19101"]) lm_design_batch3 <- model.matrix(~batch + cdr, - data = coldata(umi.qc)[umi.qc$individual == "na19239",]) + data = colData(umi.qc)[umi.qc$individual == "na19239",]) fit_indi3 <- lmfit(logcounts(umi.qc)[, umi.qc$individual == "na19239"], lm_design_batch3) fit_indi3$coefficients[,1] <- 0 ## replace intercept with 0 to preserve reference batch resids_lm_batch3 <- residuals(fit_indi3, logcounts(umi.qc)[, umi.qc$individual == "na19239"]) @@ -200,6 +200,14 @@ What do you think of the results of this approach? #### Negative binomial generalized linear models +__Advanced exercise__ + +Can you use the `edgeR` package to use a negative binomial generalized linear +model to regress out batch effects? + +_Hint_: follow a similar approach to that taken in the `limma` example above. +You will need to use the `DGEList()`, `estimateDisp()`, and `glmQLFit()` +functions. 
### sctransform @@ -216,7 +224,7 @@ effects without removing differences between individuals. However, here we will demonstrate how you *would* try to remove batch effects with `sctransform` for a kinder experimental design. -```{r sctransform-apply} +```{r sctransform-apply, results='hide'} umi_sparse <- as(counts(umi.qc), "dgCMatrix") ### Genes expressed in at least 5 cells will be kept sctnorm_data <- sctransform::vst(umi = umi_sparse, min_cells = 1, @@ -273,10 +281,12 @@ $W$, $\alpha$, $\beta$, and $k$ is infeasible. For a given $k$, instead the following three approaches to estimate the factors of unwanted variation $W$ are used: -* _RUVg_ uses negative control genes (e.g. ERCCs), assumed to have constant expression across samples; -* _RUVs_ uses centered (technical) replicate/negative control samples for which the covariates of interest are -constant; -* _RUVr_ uses residuals, e.g., from a first-pass GLM regression of the counts on the covariates of interest. +* _RUVg_ uses negative control genes (e.g. ERCCs), assumed to have constant + expression across samples; +* _RUVs_ uses centered (technical) replicate/negative control samples for which +the covariates of interest are constant; +* _RUVr_ uses residuals, e.g., from a first-pass GLM regression of the counts on + the covariates of interest. We will concentrate on the first two approaches. 
diff --git a/public/advanced-exercises.html b/public/advanced-exercises.html index 040dd7334b21527b35d8d8cc54e0358094de04cb..51e1b1158b7162c9adc8945378309f80635b7f26 100644 --- a/public/advanced-exercises.html +++ b/public/advanced-exercises.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa 
fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa 
fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li 
class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -547,20 +547,20 @@ the Salmon index that was used for the quantification).</p> <p>Here we will show you how to create an <code>SCE</code> from a <code>MultiAssayExperiment</code> object. 
For example, if you download <code>Shalek2013</code> dataset you will be able to create an <code>SCE</code> using the following code:</p> -<div class="sourceCode" id="cb821"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb821-1" data-line-number="1"><span class="kw">library</span>(MultiAssayExperiment)</a> -<a class="sourceLine" id="cb821-2" data-line-number="2"><span class="kw">library</span>(SummarizedExperiment)</a> -<a class="sourceLine" id="cb821-3" data-line-number="3"><span class="kw">library</span>(scater)</a> -<a class="sourceLine" id="cb821-4" data-line-number="4">d <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"~/Desktop/GSE41265.rds"</span>)</a> -<a class="sourceLine" id="cb821-5" data-line-number="5">cts <-<span class="st"> </span><span class="kw">assays</span>(<span class="kw">experiments</span>(d)[[<span class="st">"gene"</span>]])[[<span class="st">"count_lstpm"</span>]]</a> -<a class="sourceLine" id="cb821-6" data-line-number="6">tpms <-<span class="st"> </span><span class="kw">assays</span>(<span class="kw">experiments</span>(d)[[<span class="st">"gene"</span>]])[[<span class="st">"TPM"</span>]]</a> -<a class="sourceLine" id="cb821-7" data-line-number="7">phn <-<span class="st"> </span><span class="kw">colData</span>(d)</a> -<a class="sourceLine" id="cb821-8" data-line-number="8">sce <-<span class="st"> </span><span class="kw">SingleCellExperiment</span>(</a> -<a class="sourceLine" id="cb821-9" data-line-number="9"> <span class="dt">assays =</span> <span class="kw">list</span>(</a> -<a class="sourceLine" id="cb821-10" data-line-number="10"> <span class="dt">countData =</span> cts, </a> -<a class="sourceLine" id="cb821-11" data-line-number="11"> <span class="dt">tpmData =</span> tpms</a> -<a class="sourceLine" id="cb821-12" data-line-number="12"> ),</a> -<a class="sourceLine" id="cb821-13" data-line-number="13"> <span class="dt">colData =</span> phn</a> -<a class="sourceLine" 
id="cb821-14" data-line-number="14">)</a></code></pre></div> +<div class="sourceCode" id="cb904"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb904-1" data-line-number="1"><span class="kw">library</span>(MultiAssayExperiment)</a> +<a class="sourceLine" id="cb904-2" data-line-number="2"><span class="kw">library</span>(SummarizedExperiment)</a> +<a class="sourceLine" id="cb904-3" data-line-number="3"><span class="kw">library</span>(scater)</a> +<a class="sourceLine" id="cb904-4" data-line-number="4">d <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"~/Desktop/GSE41265.rds"</span>)</a> +<a class="sourceLine" id="cb904-5" data-line-number="5">cts <-<span class="st"> </span><span class="kw">assays</span>(<span class="kw">experiments</span>(d)[[<span class="st">"gene"</span>]])[[<span class="st">"count_lstpm"</span>]]</a> +<a class="sourceLine" id="cb904-6" data-line-number="6">tpms <-<span class="st"> </span><span class="kw">assays</span>(<span class="kw">experiments</span>(d)[[<span class="st">"gene"</span>]])[[<span class="st">"TPM"</span>]]</a> +<a class="sourceLine" id="cb904-7" data-line-number="7">phn <-<span class="st"> </span><span class="kw">colData</span>(d)</a> +<a class="sourceLine" id="cb904-8" data-line-number="8">sce <-<span class="st"> </span><span class="kw">SingleCellExperiment</span>(</a> +<a class="sourceLine" id="cb904-9" data-line-number="9"> <span class="dt">assays =</span> <span class="kw">list</span>(</a> +<a class="sourceLine" id="cb904-10" data-line-number="10"> <span class="dt">countData =</span> cts, </a> +<a class="sourceLine" id="cb904-11" data-line-number="11"> <span class="dt">tpmData =</span> tpms</a> +<a class="sourceLine" id="cb904-12" data-line-number="12"> ),</a> +<a class="sourceLine" id="cb904-13" data-line-number="13"> <span class="dt">colData =</span> phn</a> +<a class="sourceLine" id="cb904-14" data-line-number="14">)</a></code></pre></div> <p>You can also see that 
several different QC metrics have already been pre-calculated on the <a href="http://imlspenticton.uzh.ch:3838/conquer/">conquer</a> website.</p> diff --git a/public/cell-calling.md b/public/cell-calling.md index c33d699417f1d6955d264ede52a1616c27618a22..81dabc2b022d57a11e3be3c471a75d7152b86c63 100644 --- a/public/cell-calling.md +++ b/public/cell-calling.md @@ -46,6 +46,8 @@ barcode_rank <- rank(-umi_per_barcode[,2]) plot(barcode_rank, umi_per_barcode[,2], xlim=c(1,8000)) ``` +<img src="cell-calling_files/figure-html/unnamed-chunk-4-1.png" width="90%" style="display: block; margin: auto;" /> + Here we can see an roughly exponential curve of library sizes, so to make things simpler lets log-transform them. @@ -55,6 +57,8 @@ log_lib_size <- log10(umi_per_barcode[,2]) plot(barcode_rank, log_lib_size, xlim=c(1,8000)) ``` +<img src="cell-calling_files/figure-html/unnamed-chunk-5-1.png" width="90%" style="display: block; margin: auto;" /> + That's better, the "knee" in the distribution is much more pronounced. We could manually estimate where the "knee" is but it much more reproducible to algorithmically identify this point. @@ -71,7 +75,11 @@ inflection <- which(rawdiff == min(rawdiff[100:length(rawdiff)], na.rm=TRUE)) plot(barcode_rank, log_lib_size, xlim=c(1,8000)) abline(v=inflection, col="red", lwd=2) +``` + +<img src="cell-calling_files/figure-html/unnamed-chunk-6-1.png" width="90%" style="display: block; margin: auto;" /> +```r threshold <- 10^log_lib_size[inflection] cells <- umi_per_barcode[umi_per_barcode[,2] > threshold,1] @@ -80,6 +88,10 @@ Recall <- sum(cells %in% truth[,1])/length(truth[,1]) c(TPR, Recall) ``` +``` +## [1] 1.0000000 0.7831707 +``` + ### Mixture model Another is to fix a mixture model and find where the higher and lower distributions intersect. 
However, data may not fit the assumed distributions very well: @@ -89,8 +101,32 @@ Another is to fix a mixture model and find where the higher and lower distributi set.seed(-92497) # mixture model require("mixtools") +``` + +``` +## Loading required package: mixtools +``` + +``` +## mixtools package, version 1.1.0, Released 2017-03-10 +## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772. +``` + +```r mix <- normalmixEM(log_lib_size) +``` + +``` +## number of iterations= 43 +``` + +```r plot(mix, which=2, xlab2="log(mol per cell)") +``` + +<img src="cell-calling_files/figure-html/unnamed-chunk-7-1.png" width="90%" style="display: block; margin: auto;" /> + +```r p1 <- dnorm(log_lib_size, mean=mix$mu[1], sd=mix$sigma[1]) p2 <- dnorm(log_lib_size, mean=mix$mu[2], sd=mix$sigma[2]) if (mix$mu[1] < mix$mu[2]) { @@ -122,6 +158,8 @@ thresh = totals[round(0.01*n_cells)]/10 plot(totals, xlim=c(1,8000)) abline(h=thresh, col="red", lwd=2) ``` + +<img src="cell-calling_files/figure-html/unnamed-chunk-9-1.png" width="90%" style="display: block; margin: auto;" /> __Exercise__ Identify cells using this threshodl and calculate the TPR and Recall. 
diff --git a/public/cell-calling_files/figure-html/unnamed-chunk-4-1.png b/public/cell-calling_files/figure-html/unnamed-chunk-4-1.png new file mode 100644 index 0000000000000000000000000000000000000000..b0318cc166af14f71f8eeba58d44bf54a074efc0 Binary files /dev/null and b/public/cell-calling_files/figure-html/unnamed-chunk-4-1.png differ diff --git a/public/cell-calling_files/figure-html/unnamed-chunk-5-1.png b/public/cell-calling_files/figure-html/unnamed-chunk-5-1.png new file mode 100644 index 0000000000000000000000000000000000000000..ee8ef0c8828df115513c0baee032e78973e78e9e Binary files /dev/null and b/public/cell-calling_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/public/cell-calling_files/figure-html/unnamed-chunk-6-1.png b/public/cell-calling_files/figure-html/unnamed-chunk-6-1.png new file mode 100644 index 0000000000000000000000000000000000000000..4cf0faac95b2143eb68784a03a3106292c8d3b9c Binary files /dev/null and b/public/cell-calling_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/public/cell-calling_files/figure-html/unnamed-chunk-7-1.png b/public/cell-calling_files/figure-html/unnamed-chunk-7-1.png new file mode 100644 index 0000000000000000000000000000000000000000..e545177d586b0f9a3790101d751cc28dfdaa4dee Binary files /dev/null and b/public/cell-calling_files/figure-html/unnamed-chunk-7-1.png differ diff --git a/public/cell-calling_files/figure-html/unnamed-chunk-9-1.png b/public/cell-calling_files/figure-html/unnamed-chunk-9-1.png new file mode 100644 index 0000000000000000000000000000000000000000..65ccad7e63cf27dc57842414b2d148e5dea6c5d9 Binary files /dev/null and b/public/cell-calling_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/public/clust-intro.md b/public/clust-intro.md index 6f02bc7a3e080c215682836b0233b8ef9ad35111..8b0b279a2498f8fb422c53cf9f00b700b9ae1645 100644 --- a/public/clust-intro.md +++ b/public/clust-intro.md @@ -105,7 +105,7 @@ to scRNA-seq data by building a graph where each vertice 
represents a cell and (weight of) the edge measures similarity between two cells. Actually, graph-based clustering is the most popular clustering algorithm in scRNA-seq data analysis, and has been reported to have outperformed other -clustering methods in many situations (ref). +clustering methods in many situations [@freytag2018comparison]. ##### Why do we want to represent the data as a graph?\ @@ -120,12 +120,12 @@ clustering methods in many situations (ref). - __Step2__: Add weights, and obtain a shared nearest neighbour (__SNN__) graph -<center>{width= 4%}</center> +<center>{width=40%}</center> There are two ways of adding weights: number and rank.\ - _number_: The number of shared nodes between $u$ and $v$, in this case, 3. \ -- _rank_: A measurement of the closeness to their common nearest neighbours. (ref) \ +- _rank_: A measurement of the closeness to their common nearest neighbours. (@xu2015identification) \ <font color="#bf812d"> @@ -142,28 +142,37 @@ $$ w(u, v) = K - s(u, v).$$ ##### Quality function (Modularity)\ -Modularity is not the only quality function for graph-based clustering, +Modularity [@newman2004finding] is not the only quality function for graph-based clustering, but it is one of the first attempts to embed in a compact form many questions including -<font color="red"> ... </font>.\ +the definition of quality function and null model etc.\ __The idea of modularity__: A random graph should not have a cluster structure. \ The more "quality" a partition has compared to a random graph, the "better" the partition is.\ Specifically, it is defined by: the <font color="#bf812d"> quality </font> of a partition on the actual graph $-$ the quality of the same partition on a <font color="#bf812d"> random graph </font> - + <font color="#bf812d"> quality </font>: Sum of the weights within clusters \ <font color="#bf812d"> random graph </font>: a copy of the original graph, with some of its properties, but without community structure. 
The random graph defined by modularity is: each node has the same degree as the original graph. - $$ Q \propto \sum_{i, j} A_{i, j} \delta(i, j) - \sum_{i, j} \dfrac{k_i k_j}{2m} \delta(i, j)$$ -<font color="red"> [notations] </font> + $$ Q \propto \sum_{i, j} A_{i, j} \delta(i, j) - \sum_{i, j} \dfrac{k_i k_j}{2m} \delta(i, j)$$ + +- $A_{i, j}$: weight between node $i$ and $j$; + +- $\delta(i, j)$: indicator of whether $i$ and $j$ are in the same cluster; + +- $k_i$: the degree of node $i$ (the sum of weights of all edges connected to $i$); + +- $m$: the total weight in the whole graph. + + __Higher modularity implies better partition__: <center>{width=80%}</center> -__Limits of modularity__: \ +__Limits of modularity__: [@good2010performance]\ 1. Resolution limit. \ Short version: Modularity maximization forces small communities into larger ones. \ Longer version: For two clusters $A$ and $B$, if $k_A k_B < 2m$ then modularity increases by merging A and B into a single cluster, even if A and B are distinct clusters.\ @@ -179,12 +188,13 @@ __Limits of modularity__: \ Modularity-based clustering methods implemented in single cell analysis are mostly greedy algorithms, that are very fast, although not the most accurate approaches. 
- __Louvain__: + __Louvain__: [@blondel2008fast] <center>{width=80%}</center> - __Leiden__: Improved Louvain, hybrid of greedy algorithm and sampling technique \ + __Leiden__:[@traag2019louvain] \ + Improved Louvain, hybrid of greedy algorithm and sampling technique \ ##### __Advantages__: \ -Fast \ diff --git a/public/clustering-and-cell-annotation.html b/public/clustering-and-cell-annotation.html index e6339c3ece12ff78fc29cfafce2f4a35969612eb..cb4f950d603a078b6770d87204f247d84e29b45c 100644 --- a/public/clustering-and-cell-annotation.html +++ b/public/clustering-and-cell-annotation.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" 
data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some 
notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa 
fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" 
data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -582,7 +582,7 @@ to scRNA-seq data by building a graph where each vertice represents a cell and (weight of) the edge measures similarity between two cells.<br /> Actually, graph-based clustering is the most popular clustering algorithm in scRNA-seq data 
analysis, and has been reported to have outperformed other -clustering methods in many situations (ref).</p> +clustering methods in many situations <span class="citation">(Freytag et al. <a href="#ref-freytag2018comparison">2018</a>)</span>.</p> <div id="why-do-we-want-to-represent-the-data-as-a-graph" class="section level5"> <h5><span class="header-section-number">10.1.2.3.1</span> Why do we want to represent the data as a graph?<br /> </h5> @@ -606,11 +606,11 @@ clustering methods in many situations (ref).</p> <li><strong>Step2</strong>: Add weights, and obtain a shared nearest neighbour (<strong>SNN</strong>) graph</li> </ul> <center> -<img src="figures/SNN.jpg" />{width= 4%} +<img src="figures/SNN.jpg" style="width:40.0%" /> </center> <p>There are two ways of adding weights: number and rank.<br /> - <em>number</em>: The number of shared nodes between <span class="math inline">\(u\)</span> and <span class="math inline">\(v\)</span>, in this case, 3.<br /> -- <em>rank</em>: A measurement of the closeness to their common nearest neighbours. (ref)<br /> +- <em>rank</em>: A measurement of the closeness to their common nearest neighbours. 
(<span class="citation">Xu and Su (<a href="#ref-xu2015identification">2015</a>)</span>)<br /> </p> <p><font color="#bf812d"><br /> <strong>Details of rank </strong>:<br /> @@ -627,9 +627,9 @@ The final expression of weight: <div id="quality-function-modularity" class="section level5"> <h5><span class="header-section-number">10.1.2.3.3</span> Quality function (Modularity)<br /> </h5> -<p>Modularity is not the only quality function for graph-based clustering, +<p>Modularity <span class="citation">(Newman and Girvan <a href="#ref-newman2004finding">2004</a>)</span> is not the only quality function for graph-based clustering, but it is one of the first attempts to embed in a compact form many questions including -<font color="red"> … </font>.<br /> +the definition of quality function and null model etc.<br /> </p> <p><strong>The idea of modularity</strong>: A random graph should not have a cluster structure.<br /> The more “quality†a partition has compared to a random graph, the “better†the partition is.<br /> @@ -637,13 +637,18 @@ Specifically, it is defined by:</p> <p>the <font color="#bf812d"> quality </font> of a partition on the actual graph <span class="math inline">\(-\)</span> the quality of the same partition on a <font color="#bf812d"> random graph </font></p> <p><font color="#bf812d"> quality </font>: Sum of the weights within clusters<br /> <font color="#bf812d"> random graph </font>: a copy of the original graph, with some of its properties, but without community structure. 
The random graph defined by modularity is: each node has the same degree as the original graph.</p> -<p><span class="math display">\[ Q \propto \sum_{i, j} A_{i, j} \delta(i, j) - \sum_{i, j} \dfrac{k_i k_j}{2m} \delta(i, j)\]</span> -<font color="red"> [notations] </font></p> +<p><span class="math display">\[ Q \propto \sum_{i, j} A_{i, j} \delta(i, j) - \sum_{i, j} \dfrac{k_i k_j}{2m} \delta(i, j)\]</span></p> +<ul> +<li><p><span class="math inline">\(A_{i, j}\)</span>: weight between node <span class="math inline">\(i\)</span> and <span class="math inline">\(j\)</span>;</p></li> +<li><p><span class="math inline">\(\delta(i, j)\)</span>: indicator of whether <span class="math inline">\(i\)</span> and <span class="math inline">\(j\)</span> are in the same cluster;</p></li> +<li><p><span class="math inline">\(k_i\)</span>: the degree of node <span class="math inline">\(i\)</span> (the sum of weights of all edges connected to <span class="math inline">\(i\)</span>);</p></li> +<li><p><span class="math inline">\(m\)</span>: the total weight of the whole graph.</p></li> +</ul> <strong>Higher modularity implies better partition</strong>: <center> <img src="figures/modularity.jpg" style="width:80.0%" /> </center> -<strong>Limits of modularity</strong>:<br /> +<strong>Limits of modularity</strong>: <span class="citation">(Good, De Montjoye, and Clauset <a href="#ref-good2010performance">2010</a>)</span><br /> 1. 
Resolution limit.<br /> Short version: Modularity maximization forces small communities into larger ones.<br /> Longer version: For two clusters <span class="math inline">\(A\)</span> and <span class="math inline">\(B\)</span>, if <span class="math inline">\(k_A k_B < 2m\)</span> then modularity increases by merging A and B into a single cluster, even if A and B are distinct clusters.<br /> @@ -663,11 +668,12 @@ Longer version: For two clusters <span class="math inline">\(A\)</span> and <spa </h5> <p>Modularity-based clustering methods implemented in single cell analysis are mostly greedy algorithms, that are very fast, although not the most accurate approaches.</p> -<p>  <strong>Louvain</strong>:</p> +<p>  <strong>Louvain</strong>: <span class="citation">(Blondel et al. <a href="#ref-blondel2008fast">2008</a>)</span></p> <center> <img src="figures/Louvain.jpg" style="width:80.0%" /> </center> -<p>  <strong>Leiden</strong>: Improved Louvain, hybrid of greedy algorithm and sampling technique<br /> +<p>  <strong>Leiden</strong>:<span class="citation">(Traag, Waltman, and Eck <a href="#ref-traag2019louvain">2019</a>)</span><br /> +Improved Louvain, hybrid of greedy algorithm and sampling technique<br /> </p> </div> <div id="advantages" class="section level5"> @@ -749,14 +755,14 @@ Average of all the partitions </div> <div id="clust-methods" class="section level2"> <h2><span class="header-section-number">10.2</span> Clustering example</h2> -<div class="sourceCode" id="cb598"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb598-1" data-line-number="1"><span class="kw">library</span>(pcaMethods)</a> -<a class="sourceLine" id="cb598-2" data-line-number="2"><span class="kw">library</span>(SC3)</a> -<a class="sourceLine" id="cb598-3" data-line-number="3"><span class="kw">library</span>(scater)</a> -<a class="sourceLine" id="cb598-4" data-line-number="4"><span class="kw">library</span>(SingleCellExperiment)</a> -<a class="sourceLine" 
id="cb598-5" data-line-number="5"><span class="kw">library</span>(pheatmap)</a> -<a class="sourceLine" id="cb598-6" data-line-number="6"><span class="kw">library</span>(mclust)</a> -<a class="sourceLine" id="cb598-7" data-line-number="7"><span class="kw">library</span>(igraph)</a> -<a class="sourceLine" id="cb598-8" data-line-number="8"><span class="kw">library</span>(scran)</a></code></pre></div> +<div class="sourceCode" id="cb671"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb671-1" data-line-number="1"><span class="kw">library</span>(pcaMethods)</a> +<a class="sourceLine" id="cb671-2" data-line-number="2"><span class="kw">library</span>(SC3)</a> +<a class="sourceLine" id="cb671-3" data-line-number="3"><span class="kw">library</span>(scater)</a> +<a class="sourceLine" id="cb671-4" data-line-number="4"><span class="kw">library</span>(SingleCellExperiment)</a> +<a class="sourceLine" id="cb671-5" data-line-number="5"><span class="kw">library</span>(pheatmap)</a> +<a class="sourceLine" id="cb671-6" data-line-number="6"><span class="kw">library</span>(mclust)</a> +<a class="sourceLine" id="cb671-7" data-line-number="7"><span class="kw">library</span>(igraph)</a> +<a class="sourceLine" id="cb671-8" data-line-number="8"><span class="kw">library</span>(scran)</a></code></pre></div> <div id="example-1.-graph-based-clustering-deng-dataset" class="section level3"> <h3><span class="header-section-number">10.2.1</span> Example 1. Graph-based clustering (deng dataset)</h3> <p>To illustrate clustering of scRNA-seq data, we consider the <code>Deng</code> dataset of @@ -764,56 +770,60 @@ cells from developing mouse embryo <span class="citation">(Deng et al. <a href=" dataset and created a <code>SingleCellExperiment</code> object in advance. 
We have also annotated the cells with the cell types identified in the original publication (it is the <code>cell_type2</code> column in the <code>colData</code> slot).</p> -<div class="sourceCode" id="cb599"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb599-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/deng/deng-reads.rds"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb672"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb672-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/deng/deng-reads.rds"</span>)</a></code></pre></div> <p>First, we build a <span class="math inline">\(K\)</span>-NN graph with a package function from <a href="https://bioconductor.org/packages/release/bioc/html/scran.html">scran</a>. The most important decision of building a graph is the choice of <span class="math inline">\(K\)</span>, of which there is no standard rule. In general, we can think of it as an indication of the desired cluster size. 
If <span class="math inline">\(K\)</span> is too small, a genuine cluster might be split into parts, while if <span class="math inline">\(K\)</span> is too large, clusters might not thoroughly separated.</p> -<div class="sourceCode" id="cb600"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb600-1" data-line-number="1">deng5 <-<span class="st"> </span><span class="kw">buildSNNGraph</span>(deng, <span class="dt">k =</span> <span class="dv">5</span>)</a> -<a class="sourceLine" id="cb600-2" data-line-number="2">deng15 <-<span class="st"> </span><span class="kw">buildSNNGraph</span>(deng, <span class="dt">k =</span> <span class="dv">15</span>)</a> -<a class="sourceLine" id="cb600-3" data-line-number="3">deng25 <-<span class="st"> </span><span class="kw">buildSNNGraph</span>(deng, <span class="dt">k =</span> <span class="dv">25</span>)</a> -<a class="sourceLine" id="cb600-4" data-line-number="4"><span class="kw">par</span>(<span class="dt">mfrow=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">3</span>))</a> -<a class="sourceLine" id="cb600-5" data-line-number="5"><span class="kw">plot</span>(deng5, <span class="dt">vertex.size =</span> <span class="dv">4</span>, <span class="dt">vertex.label =</span> <span class="ot">NA</span>)</a> -<a class="sourceLine" id="cb600-6" data-line-number="6"><span class="kw">title</span>(<span class="st">"5-NN"</span> ,<span class="dt">line =</span> <span class="dv">-33</span>, <span class="dt">cex.main =</span> <span class="dv">3</span>)</a> -<a class="sourceLine" id="cb600-7" data-line-number="7"><span class="kw">plot</span>(deng15, <span class="dt">vertex.size =</span> <span class="dv">4</span>, <span class="dt">vertex.label =</span> <span class="ot">NA</span>)</a> -<a class="sourceLine" id="cb600-8" data-line-number="8"><span class="kw">title</span>(<span class="st">"15-NN"</span> ,<span class="dt">line =</span> <span class="dv">-33</span>, <span class="dt">cex.main =</span> 
<span class="dv">3</span>)</a> -<a class="sourceLine" id="cb600-9" data-line-number="9"><span class="kw">plot</span>(deng25, <span class="dt">vertex.size =</span> <span class="dv">4</span>, <span class="dt">vertex.label =</span> <span class="ot">NA</span>)</a> -<a class="sourceLine" id="cb600-10" data-line-number="10"><span class="kw">title</span>(<span class="st">"25-NN"</span> ,<span class="dt">line =</span> <span class="dv">-33</span>, <span class="dt">cex.main =</span> <span class="dv">3</span>)</a></code></pre></div> +<div class="sourceCode" id="cb673"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb673-1" data-line-number="1">deng5 <-<span class="st"> </span><span class="kw">buildSNNGraph</span>(deng, <span class="dt">k =</span> <span class="dv">5</span>)</a> +<a class="sourceLine" id="cb673-2" data-line-number="2">deng15 <-<span class="st"> </span><span class="kw">buildSNNGraph</span>(deng, <span class="dt">k =</span> <span class="dv">15</span>)</a> +<a class="sourceLine" id="cb673-3" data-line-number="3">deng25 <-<span class="st"> </span><span class="kw">buildSNNGraph</span>(deng, <span class="dt">k =</span> <span class="dv">25</span>)</a> +<a class="sourceLine" id="cb673-4" data-line-number="4"><span class="kw">par</span>(<span class="dt">mfrow=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">3</span>))</a> +<a class="sourceLine" id="cb673-5" data-line-number="5"><span class="kw">plot</span>(deng5, <span class="dt">vertex.size =</span> <span class="dv">4</span>, <span class="dt">vertex.label =</span> <span class="ot">NA</span>)</a> +<a class="sourceLine" id="cb673-6" data-line-number="6"><span class="kw">title</span>(<span class="st">"5-NN"</span> ,<span class="dt">line =</span> <span class="dv">-33</span>, <span class="dt">cex.main =</span> <span class="dv">3</span>)</a> +<a class="sourceLine" id="cb673-7" data-line-number="7"><span class="kw">plot</span>(deng15, <span class="dt">vertex.size 
=</span> <span class="dv">4</span>, <span class="dt">vertex.label =</span> <span class="ot">NA</span>)</a> +<a class="sourceLine" id="cb673-8" data-line-number="8"><span class="kw">title</span>(<span class="st">"15-NN"</span> ,<span class="dt">line =</span> <span class="dv">-33</span>, <span class="dt">cex.main =</span> <span class="dv">3</span>)</a> +<a class="sourceLine" id="cb673-9" data-line-number="9"><span class="kw">plot</span>(deng25, <span class="dt">vertex.size =</span> <span class="dv">4</span>, <span class="dt">vertex.label =</span> <span class="ot">NA</span>)</a> +<a class="sourceLine" id="cb673-10" data-line-number="10"><span class="kw">title</span>(<span class="st">"25-NN"</span> ,<span class="dt">line =</span> <span class="dv">-33</span>, <span class="dt">cex.main =</span> <span class="dv">3</span>)</a></code></pre></div> <p><img src="clustering_files/figure-html/unnamed-chunk-2-1.png" width="1152" style="display: block; margin: auto;" /></p> <p>Perform Louvain clustering:</p> -<div class="sourceCode" id="cb601"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb601-1" data-line-number="1">cl <-<span class="st"> </span>igraph<span class="op">::</span><span class="kw">cluster_louvain</span>(deng15)<span class="op">$</span>membership</a> -<a class="sourceLine" id="cb601-2" data-line-number="2"><span class="kw">colData</span>(deng)<span class="op">$</span>cl <-<span class="st"> </span><span class="kw">factor</span>(cl)</a> -<a class="sourceLine" id="cb601-3" data-line-number="3">mclust<span class="op">::</span><span class="kw">adjustedRandIndex</span>(<span class="kw">colData</span>(deng)<span class="op">$</span>cell_type1, <span class="kw">colData</span>(deng)<span class="op">$</span>cl)</a></code></pre></div> -<pre><code>## [1] 0.8248454</code></pre> +<div class="sourceCode" id="cb674"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb674-1" data-line-number="1">cl <-<span class="st"> 
</span>igraph<span class="op">::</span><span class="kw">cluster_louvain</span>(deng15)<span class="op">$</span>membership</a> +<a class="sourceLine" id="cb674-2" data-line-number="2"><span class="kw">colData</span>(deng)<span class="op">$</span>cl <-<span class="st"> </span><span class="kw">factor</span>(cl)</a> +<a class="sourceLine" id="cb674-3" data-line-number="3">mclust<span class="op">::</span><span class="kw">adjustedRandIndex</span>(<span class="kw">colData</span>(deng)<span class="op">$</span>cell_type2, <span class="kw">colData</span>(deng)<span class="op">$</span>cl)</a></code></pre></div> +<pre><code>## [1] 0.4197754</code></pre> <p>Reaches very high similarity with the labels provided in the original paper.</p> <p>However, it tend to merge small clusters into larger ones.</p> -<div class="sourceCode" id="cb603"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb603-1" data-line-number="1"><span class="kw">table</span>(deng<span class="op">$</span>cell_type1, cl)</a></code></pre></div> -<pre><code>## cl -## 1 2 3 -## 16cell 49 0 1 -## 2cell 0 22 0 -## 4cell 0 14 0 -## 8cell 36 0 1 -## blast 0 0 133 -## zygote 0 12 0</code></pre> +<div class="sourceCode" id="cb676"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb676-1" data-line-number="1"><span class="kw">table</span>(deng<span class="op">$</span>cell_type2, cl)</a></code></pre></div> +<pre><code>## cl +## 1 2 3 +## 16cell 49 0 1 +## 4cell 0 14 0 +## 8cell 36 0 1 +## early2cell 0 8 0 +## earlyblast 0 0 43 +## late2cell 0 10 0 +## lateblast 0 0 30 +## mid2cell 0 12 0 +## midblast 0 0 60 +## zy 0 4 0</code></pre> </div> <div id="example-2.-graph-based-clustering-segerstolpe-dataset" class="section level3"> <h3><span class="header-section-number">10.2.2</span> Example 2. 
Graph-based clustering (segerstolpe dataset)</h3> -<div class="sourceCode" id="cb605"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb605-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/muraro.rds"</span>)</a> -<a class="sourceLine" id="cb605-2" data-line-number="2"><span class="co">## PCA</span></a> -<a class="sourceLine" id="cb605-3" data-line-number="3">var.fit <-<span class="st"> </span><span class="kw">suppressWarnings</span>(<span class="kw">trendVar</span>(muraro, <span class="dt">parametric=</span><span class="ot">TRUE</span>, <span class="dt">use.spikes=</span>F))</a> -<a class="sourceLine" id="cb605-4" data-line-number="4">muraro <-<span class="st"> </span><span class="kw">suppressWarnings</span>(<span class="kw">denoisePCA</span>(muraro, <span class="dt">technical=</span>var.fit<span class="op">$</span>trend))</a> -<a class="sourceLine" id="cb605-5" data-line-number="5"><span class="kw">dim</span>(<span class="kw">reducedDim</span>(muraro, <span class="st">"PCA"</span>))</a></code></pre></div> +<div class="sourceCode" id="cb678"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb678-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/muraro.rds"</span>)</a> +<a class="sourceLine" id="cb678-2" data-line-number="2"><span class="co">## PCA</span></a> +<a class="sourceLine" id="cb678-3" data-line-number="3">var.fit <-<span class="st"> </span><span class="kw">suppressWarnings</span>(<span class="kw">trendVar</span>(muraro, <span class="dt">parametric=</span><span class="ot">TRUE</span>, <span class="dt">use.spikes=</span>F))</a> +<a class="sourceLine" id="cb678-4" data-line-number="4">muraro <-<span class="st"> </span><span class="kw">suppressWarnings</span>(<span class="kw">denoisePCA</span>(muraro, <span class="dt">technical=</span>var.fit<span 
class="op">$</span>trend))</a> +<a class="sourceLine" id="cb678-5" data-line-number="5"><span class="kw">dim</span>(<span class="kw">reducedDim</span>(muraro, <span class="st">"PCA"</span>))</a></code></pre></div> <pre><code>## [1] 2126 5</code></pre> -<div class="sourceCode" id="cb607"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb607-1" data-line-number="1"><span class="co">## Build graph and clustering</span></a> -<a class="sourceLine" id="cb607-2" data-line-number="2">gr <-<span class="st"> </span><span class="kw">buildSNNGraph</span>(muraro, <span class="dt">use.dimred=</span><span class="st">"PCA"</span>, <span class="dt">k =</span> <span class="dv">30</span>)</a> -<a class="sourceLine" id="cb607-3" data-line-number="3">cl <-<span class="st"> </span>igraph<span class="op">::</span><span class="kw">cluster_louvain</span>(gr)<span class="op">$</span>membership</a> -<a class="sourceLine" id="cb607-4" data-line-number="4"><span class="kw">colData</span>(muraro)<span class="op">$</span>cl <-<span class="st"> </span><span class="kw">factor</span>(cl)</a> -<a class="sourceLine" id="cb607-5" data-line-number="5">mclust<span class="op">::</span><span class="kw">adjustedRandIndex</span>(<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1, <span class="kw">colData</span>(muraro)<span class="op">$</span>cl)</a></code></pre></div> +<div class="sourceCode" id="cb680"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb680-1" data-line-number="1"><span class="co">## Build graph and clustering</span></a> +<a class="sourceLine" id="cb680-2" data-line-number="2">gr <-<span class="st"> </span><span class="kw">buildSNNGraph</span>(muraro, <span class="dt">use.dimred=</span><span class="st">"PCA"</span>, <span class="dt">k =</span> <span class="dv">30</span>)</a> +<a class="sourceLine" id="cb680-3" data-line-number="3">cl <-<span class="st"> </span>igraph<span class="op">::</span><span 
class="kw">cluster_louvain</span>(gr)<span class="op">$</span>membership</a> +<a class="sourceLine" id="cb680-4" data-line-number="4"><span class="kw">colData</span>(muraro)<span class="op">$</span>cl <-<span class="st"> </span><span class="kw">factor</span>(cl)</a> +<a class="sourceLine" id="cb680-5" data-line-number="5">mclust<span class="op">::</span><span class="kw">adjustedRandIndex</span>(<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1, <span class="kw">colData</span>(muraro)<span class="op">$</span>cl)</a></code></pre></div> <pre><code>## [1] 0.4845618</code></pre> -<div class="sourceCode" id="cb609"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb609-1" data-line-number="1"><span class="kw">table</span>(muraro<span class="op">$</span>cell_type1, cl)</a></code></pre></div> +<div class="sourceCode" id="cb682"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb682-1" data-line-number="1"><span class="kw">table</span>(muraro<span class="op">$</span>cell_type1, cl)</a></code></pre></div> <pre><code>## cl ## 1 2 3 4 5 6 7 8 9 ## acinar 0 0 0 0 0 0 218 0 1 @@ -830,28 +840,39 @@ parts, while if <span class="math inline">\(K\)</span> is too large, clusters mi <div id="example-3.-sc3" class="section level3"> <h3><span class="header-section-number">10.2.3</span> Example 3. SC3</h3> <p>Let’s run <code>SC3</code> clustering on the Deng data. The advantage of the <code>SC3</code> is that it can directly ingest a <code>SingleCellExperiment</code> object.</p> -<p>Now let’s image we do not know the number of clusters <em>k</em> (cell types). 
<code>SC3</code> can estimate a number of clusters for you:</p> -<div class="sourceCode" id="cb611"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb611-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">sc3_estimate_k</span>(deng)</a> -<a class="sourceLine" id="cb611-2" data-line-number="2"><span class="kw">metadata</span>(deng)<span class="op">$</span>sc3<span class="op">$</span>k_estimation</a></code></pre></div> -<p>Interestingly, the number of cell types predicted by <code>SC3</code> is smaller than in the original data annotation. However, early, mid and late stages of different cell types together, we will have exactly 6 cell types. We store the merged cell types in <code>cell_type1</code> column of the <code>colData</code> slot:</p> -<div class="sourceCode" id="cb612"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb612-1" data-line-number="1"><span class="kw">plotPCA</span>(deng, <span class="dt">colour_by =</span> <span class="st">"cell_type1"</span>)</a></code></pre></div> -<p>Now we are ready to run <code>SC3</code> (we also ask it to calculate biological properties of the clusters):</p> -<div class="sourceCode" id="cb613"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb613-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">sc3</span>(deng, <span class="dt">ks =</span> <span class="dv">10</span>, <span class="dt">biology =</span> <span class="ot">TRUE</span>, <span class="dt">n_cores =</span> <span class="dv">1</span>)</a></code></pre></div> +<p><code>SC3</code> can estimate a number of clusters:</p> +<div class="sourceCode" id="cb684"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb684-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">sc3_estimate_k</span>(deng)</a></code></pre></div> +<pre><code>## Estimating k...</code></pre> +<div class="sourceCode" 
id="cb686"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb686-1" data-line-number="1"><span class="kw">metadata</span>(deng)<span class="op">$</span>sc3<span class="op">$</span>k_estimation</a></code></pre></div> +<pre><code>## [1] 6</code></pre> +<p>Next we run <code>SC3</code> (we also ask it to calculate biological properties of the clusters):</p> +<div class="sourceCode" id="cb688"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb688-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">sc3</span>(deng, <span class="dt">ks =</span> <span class="dv">10</span>, <span class="dt">biology =</span> <span class="ot">TRUE</span>, <span class="dt">n_cores =</span> <span class="dv">1</span>)</a></code></pre></div> +<pre><code>## Setting SC3 parameters...</code></pre> +<pre><code>## Calculating distances between the cells...</code></pre> +<pre><code>## Performing transformations and calculating eigenvectors...</code></pre> +<pre><code>## Performing k-means clustering...</code></pre> +<pre><code>## Calculating consensus matrix...</code></pre> +<pre><code>## Calculating biology...</code></pre> <p><code>SC3</code> result consists of several different outputs (please look in <span class="citation">(Kiselev et al. <a href="#ref-Kiselev2016-bq">2017</a>)</span> and <a href="http://bioconductor.org/packages/release/bioc/vignettes/SC3/inst/doc/my-vignette.html">SC3 vignette</a> for more details). 
Here we show some of them:</p> <p>Consensus matrix:</p> -<div class="sourceCode" id="cb614"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb614-1" data-line-number="1"><span class="kw">sc3_plot_consensus</span>(deng, <span class="dt">k =</span> <span class="dv">10</span>, <span class="dt">show_pdata =</span> <span class="st">"cell_type2"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb695"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb695-1" data-line-number="1"><span class="kw">sc3_plot_consensus</span>(deng, <span class="dt">k =</span> <span class="dv">10</span>, <span class="dt">show_pdata =</span> <span class="st">"cell_type2"</span>)</a></code></pre></div> <p>Silhouette plot:</p> -<div class="sourceCode" id="cb615"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb615-1" data-line-number="1"><span class="kw">sc3_plot_silhouette</span>(deng, <span class="dt">k =</span> <span class="dv">10</span>)</a></code></pre></div> +<div class="sourceCode" id="cb696"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb696-1" data-line-number="1"><span class="kw">sc3_plot_silhouette</span>(deng, <span class="dt">k =</span> <span class="dv">10</span>)</a></code></pre></div> +<p><img src="clustering_files/figure-html/unnamed-chunk-7-1.png" width="672" style="display: block; margin: auto;" /></p> <p>Heatmap of the expression matrix:</p> -<div class="sourceCode" id="cb616"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb616-1" data-line-number="1"><span class="kw">sc3_plot_expression</span>(deng, <span class="dt">k =</span> <span class="dv">10</span>, <span class="dt">show_pdata =</span> <span class="st">"cell_type2"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb697"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb697-1" data-line-number="1"><span 
class="kw">sc3_plot_expression</span>(deng, <span class="dt">k =</span> <span class="dv">10</span>, <span class="dt">show_pdata =</span> <span class="st">"cell_type2"</span>)</a></code></pre></div> +<p><img src="clustering_files/figure-html/unnamed-chunk-8-1.png" width="672" style="display: block; margin: auto;" /></p> <p>Identified marker genes:</p> -<div class="sourceCode" id="cb617"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb617-1" data-line-number="1"><span class="kw">sc3_plot_markers</span>(deng, <span class="dt">k =</span> <span class="dv">10</span>, <span class="dt">show_pdata =</span> <span class="st">"cell_type2"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb698"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb698-1" data-line-number="1"><span class="kw">sc3_plot_markers</span>(deng, <span class="dt">k =</span> <span class="dv">10</span>, <span class="dt">show_pdata =</span> <span class="st">"cell_type2"</span>)</a></code></pre></div> +<p><img src="clustering_files/figure-html/unnamed-chunk-9-1.png" width="672" style="display: block; margin: auto;" /></p> <p>PCA plot with highlighted <code>SC3</code> clusters:</p> -<div class="sourceCode" id="cb618"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb618-1" data-line-number="1"><span class="kw">plotPCA</span>(deng, <span class="dt">colour_by =</span> <span class="st">"sc3_10_clusters"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb699"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb699-1" data-line-number="1"><span class="kw">plotPCA</span>(deng, <span class="dt">colour_by =</span> <span class="st">"sc3_10_clusters"</span>)</a></code></pre></div> +<p><img src="clustering_files/figure-html/unnamed-chunk-10-1.png" width="672" style="display: block; margin: auto;" /></p> <p>Compare the results of <code>SC3</code> clustering with the original 
publication cell type labels:</p> -<div class="sourceCode" id="cb619"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb619-1" data-line-number="1"><span class="kw">adjustedRandIndex</span>(<span class="kw">colData</span>(deng)<span class="op">$</span>cell_type2, <span class="kw">colData</span>(deng)<span class="op">$</span>sc3_<span class="dv">10</span>_clusters)</a></code></pre></div> +<div class="sourceCode" id="cb700"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb700-1" data-line-number="1"><span class="kw">adjustedRandIndex</span>(<span class="kw">colData</span>(deng)<span class="op">$</span>cell_type2, <span class="kw">colData</span>(deng)<span class="op">$</span>sc3_<span class="dv">10</span>_clusters)</a></code></pre></div> +<pre><code>## [1] 0.7796181</code></pre> <p><strong>Note</strong> <code>SC3</code> can also be run in an interactive <code>Shiny</code> session:</p> -<div class="sourceCode" id="cb620"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb620-1" data-line-number="1"><span class="kw">sc3_interactive</span>(deng)</a></code></pre></div> +<div class="sourceCode" id="cb702"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb702-1" data-line-number="1"><span class="kw">sc3_interactive</span>(deng)</a></code></pre></div> <p>This command will open <code>SC3</code> in a web browser.</p> <p><strong>Note</strong> Due to direct calculation of distances <code>SC3</code> becomes very slow when the number of cells is <span class="math inline">\(>5000\)</span>. 
For large datasets containing up to <span class="math inline">\(10^5\)</span> cells we recomment using <code>Seurat</code> (see chapter <a href="seurat-chapter.html#seurat-chapter">16</a>).</p> </div> @@ -860,7 +881,7 @@ parts, while if <span class="math inline">\(K\)</span> is too large, clusters mi <h2><span class="header-section-number">10.3</span> An alternative to clustering: Automatic cell annotation</h2> <div id="singler" class="section level3"> <h3><span class="header-section-number">10.3.1</span> <a href="%22https://bioconductor.org/packages/devel/bioc/html/SingleR.html%22">SingleR</a></h3> -<div id="methodology" class="section level4"> +<div id="methodology-1" class="section level4"> <h4><span class="header-section-number">10.3.1.1</span> Methodology</h4> <ul> <li><strong>Step1. Find variable gene</strong><br /> @@ -892,31 +913,33 @@ We want to know how each cell in the test data is correlated to the labels in th </center></li> <li><strong>Step4. Fine tuning</strong><br /> We stop here and assign each cell with label that score the highest, actually, if we set the argument <code>fine.tune = FALSE</code>, that is exactly what the package function <code>SingleR</code> does. -But there is one more question, what if the second highest score is very close to the highest?</li> +But there is one more question, what if the second highest score is very close to the highest? say, 1, 1, 1, 9.5, 10. +<code>SingleR</code> sets a threshold to define how close is “very close”; the default is 0.05. +For (only) the cells that fall into this category, it goes back to Step2.</li> </ul> </div> -<div id="example" class="section level4"> +<div id="example-1" class="section level4"> <h4><span class="header-section-number">10.3.1.2</span> Example</h4> <p>(Note: SingleR is not yet available in the released version of Bioconductor. 
It will be possible to run it as shown once the next Bioconductor release is made in late October.)</p> -<div class="sourceCode" id="cb621"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb621-1" data-line-number="1"><span class="kw">library</span>(scRNAseq)</a> -<a class="sourceLine" id="cb621-2" data-line-number="2"><span class="kw">library</span>(SingleR)</a> -<a class="sourceLine" id="cb621-3" data-line-number="3">segerstolpe <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/segerstolpe.rds"</span>)</a> -<a class="sourceLine" id="cb621-4" data-line-number="4">sceM <-<span class="st"> </span><span class="kw">suppressMessages</span>(<span class="kw">MuraroPancreasData</span>())</a> -<a class="sourceLine" id="cb621-5" data-line-number="5">sceM <-<span class="st"> </span>sceM[,<span class="op">!</span><span class="kw">is.na</span>(sceM<span class="op">$</span>label)]</a> -<a class="sourceLine" id="cb621-6" data-line-number="6">sceM <-<span class="st"> </span><span class="kw">logNormCounts</span>(sceM)</a> -<a class="sourceLine" id="cb621-7" data-line-number="7"><span class="co">## find common gene</span></a> -<a class="sourceLine" id="cb621-8" data-line-number="8"><span class="kw">rownames</span>(sceM) <-<span class="st"> </span><span class="kw">gsub</span>(<span class="st">"__.*"</span>,<span class="st">""</span>,<span class="kw">rownames</span>(sceM))</a> -<a class="sourceLine" id="cb621-9" data-line-number="9">common <-<span class="st"> </span><span class="kw">intersect</span>(<span class="kw">rownames</span>(sceM), <span class="kw">rownames</span>(segerstolpe))</a> -<a class="sourceLine" id="cb621-10" data-line-number="10">sceM <-<span class="st"> </span>sceM[common,]</a> -<a class="sourceLine" id="cb621-11" data-line-number="11">segerstolpe <-<span class="st"> </span>segerstolpe[common,]</a> -<a class="sourceLine" id="cb621-12" data-line-number="12"><span class="co">## Prepare 
reference</span></a> -<a class="sourceLine" id="cb621-13" data-line-number="13">out <-<span class="st"> </span><span class="kw">pairwiseTTests</span>(<span class="kw">logcounts</span>(sceM), sceM<span class="op">$</span>label, <span class="dt">direction=</span><span class="st">"up"</span>)</a> -<a class="sourceLine" id="cb621-14" data-line-number="14">markers <-<span class="st"> </span><span class="kw">getTopMarkers</span>(out<span class="op">$</span>statistics, out<span class="op">$</span>pairs, <span class="dt">n=</span><span class="dv">10</span>)</a> -<a class="sourceLine" id="cb621-15" data-line-number="15"><span class="co">## Annotation</span></a> -<a class="sourceLine" id="cb621-16" data-line-number="16">pred <-<span class="st"> </span><span class="kw">SingleR</span>(<span class="dt">test=</span>segerstolpe, <span class="dt">ref=</span>sceM, <span class="dt">labels=</span>sceM<span class="op">$</span>label, <span class="dt">genes=</span>markers)</a> -<a class="sourceLine" id="cb621-17" data-line-number="17"><span class="co">## View result </span></a> -<a class="sourceLine" id="cb621-18" data-line-number="18"><span class="kw">plotScoreHeatmap</span>(pred, <span class="dt">show.labels =</span> <span class="ot">TRUE</span>, <span class="dt">annotation_col=</span><span class="kw">data.frame</span>(</a> -<a class="sourceLine" id="cb621-19" data-line-number="19"> <span class="dt">row.names=</span><span class="kw">rownames</span>(pred)))</a></code></pre></div> +<div class="sourceCode" id="cb703"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb703-1" data-line-number="1"><span class="kw">library</span>(scRNAseq)</a> +<a class="sourceLine" id="cb703-2" data-line-number="2"><span class="kw">library</span>(SingleR)</a> +<a class="sourceLine" id="cb703-3" data-line-number="3">segerstolpe <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/segerstolpe.rds"</span>)</a> +<a class="sourceLine" 
id="cb703-4" data-line-number="4">sceM <-<span class="st"> </span><span class="kw">suppressMessages</span>(<span class="kw">MuraroPancreasData</span>())</a> +<a class="sourceLine" id="cb703-5" data-line-number="5">sceM <-<span class="st"> </span>sceM[,<span class="op">!</span><span class="kw">is.na</span>(sceM<span class="op">$</span>label)]</a> +<a class="sourceLine" id="cb703-6" data-line-number="6">sceM <-<span class="st"> </span><span class="kw">logNormCounts</span>(sceM)</a> +<a class="sourceLine" id="cb703-7" data-line-number="7"><span class="co">## find common gene</span></a> +<a class="sourceLine" id="cb703-8" data-line-number="8"><span class="kw">rownames</span>(sceM) <-<span class="st"> </span><span class="kw">gsub</span>(<span class="st">"__.*"</span>,<span class="st">""</span>,<span class="kw">rownames</span>(sceM))</a> +<a class="sourceLine" id="cb703-9" data-line-number="9">common <-<span class="st"> </span><span class="kw">intersect</span>(<span class="kw">rownames</span>(sceM), <span class="kw">rownames</span>(segerstolpe))</a> +<a class="sourceLine" id="cb703-10" data-line-number="10">sceM <-<span class="st"> </span>sceM[common,]</a> +<a class="sourceLine" id="cb703-11" data-line-number="11">segerstolpe <-<span class="st"> </span>segerstolpe[common,]</a> +<a class="sourceLine" id="cb703-12" data-line-number="12"><span class="co">## Prepare reference</span></a> +<a class="sourceLine" id="cb703-13" data-line-number="13">out <-<span class="st"> </span><span class="kw">pairwiseTTests</span>(<span class="kw">logcounts</span>(sceM), sceM<span class="op">$</span>label, <span class="dt">direction=</span><span class="st">"up"</span>)</a> +<a class="sourceLine" id="cb703-14" data-line-number="14">markers <-<span class="st"> </span><span class="kw">getTopMarkers</span>(out<span class="op">$</span>statistics, out<span class="op">$</span>pairs, <span class="dt">n=</span><span class="dv">10</span>)</a> +<a class="sourceLine" id="cb703-15" 
data-line-number="15"><span class="co">## Annotation</span></a> +<a class="sourceLine" id="cb703-16" data-line-number="16">pred <-<span class="st"> </span><span class="kw">SingleR</span>(<span class="dt">test=</span>segerstolpe, <span class="dt">ref=</span>sceM, <span class="dt">labels=</span>sceM<span class="op">$</span>label, <span class="dt">genes=</span>markers)</a> +<a class="sourceLine" id="cb703-17" data-line-number="17"><span class="co">## View result </span></a> +<a class="sourceLine" id="cb703-18" data-line-number="18"><span class="kw">plotScoreHeatmap</span>(pred, <span class="dt">show.labels =</span> <span class="ot">TRUE</span>, <span class="dt">annotation_col=</span><span class="kw">data.frame</span>(</a> +<a class="sourceLine" id="cb703-19" data-line-number="19"> <span class="dt">row.names=</span><span class="kw">rownames</span>(pred)))</a></code></pre></div> <center> <img src="figures/SingleR.png" style="width:80.0%" /> </center> @@ -924,57 +947,57 @@ But there is one more question, what if the second highest score is very close t </div> <div id="scmap" class="section level3"> <h3><span class="header-section-number">10.3.2</span> <a href="https://bioconductor.org/packages/release/bioc/html/scmap.html">scmap</a></h3> -<div class="sourceCode" id="cb622"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb622-1" data-line-number="1"><span class="co">## Load data</span></a> -<a class="sourceLine" id="cb622-2" data-line-number="2">segerstolpe <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/segerstolpe.rds"</span>) <span class="co"># test</span></a> -<a class="sourceLine" id="cb622-3" data-line-number="3"><span class="kw">library</span>(scRNAseq)</a> -<a class="sourceLine" id="cb622-4" data-line-number="4">sceM <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/muraro.rds"</span>) <span class="co"># reference</span></a> -<a class="sourceLine" 
id="cb622-5" data-line-number="5"><span class="kw">rownames</span>(sceM) <-<span class="st"> </span><span class="kw">gsub</span>(<span class="st">"__.*"</span>,<span class="st">""</span>,<span class="kw">rownames</span>(sceM))</a></code></pre></div> +<div class="sourceCode" id="cb704"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb704-1" data-line-number="1"><span class="co">## Load data</span></a> +<a class="sourceLine" id="cb704-2" data-line-number="2">segerstolpe <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/segerstolpe.rds"</span>) <span class="co"># test</span></a> +<a class="sourceLine" id="cb704-3" data-line-number="3"><span class="kw">library</span>(scRNAseq)</a> +<a class="sourceLine" id="cb704-4" data-line-number="4">sceM <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/muraro.rds"</span>) <span class="co"># reference</span></a> +<a class="sourceLine" id="cb704-5" data-line-number="5"><span class="kw">rownames</span>(sceM) <-<span class="st"> </span><span class="kw">gsub</span>(<span class="st">"__.*"</span>,<span class="st">""</span>,<span class="kw">rownames</span>(sceM))</a></code></pre></div> <p>Select the most informative features (genes) using the dropout feature selection method. 
By default select 500 features.</p> -<div class="sourceCode" id="cb623"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb623-1" data-line-number="1"><span class="kw">library</span>(scmap)</a> -<a class="sourceLine" id="cb623-2" data-line-number="2"><span class="kw">rowData</span>(sceM)<span class="op">$</span>feature_symbol <-<span class="st"> </span><span class="kw">rownames</span>(sceM)</a> -<a class="sourceLine" id="cb623-3" data-line-number="3">sceM <-<span class="st"> </span><span class="kw">selectFeatures</span>(sceM, <span class="dt">suppress_plot =</span> <span class="ot">TRUE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb705"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb705-1" data-line-number="1"><span class="kw">library</span>(scmap)</a> +<a class="sourceLine" id="cb705-2" data-line-number="2"><span class="kw">rowData</span>(sceM)<span class="op">$</span>feature_symbol <-<span class="st"> </span><span class="kw">rownames</span>(sceM)</a> +<a class="sourceLine" id="cb705-3" data-line-number="3">sceM <-<span class="st"> </span><span class="kw">selectFeatures</span>(sceM, <span class="dt">suppress_plot =</span> <span class="ot">TRUE</span>)</a></code></pre></div> <p>Index of a reference dataset is created by finding the median gene expression for each cluster. First, chop the total of 500 features into <span class="math inline">\(M = 50\)</span> chuncks/ low-dimensional subspace. 
Second, cluster each chunk into <span class="math inline">\(k = \sqrt{N}\)</span> clusters, where <span class="math inline">\(N\)</span> is the number of cells.</p> <p>By default scmap uses the <code>cell_type1</code> column of the <code>colData</code> slot in the reference to identify clusters.</p> -<div class="sourceCode" id="cb624"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb624-1" data-line-number="1">sceM <-<span class="st"> </span><span class="kw">indexCell</span>(sceM)</a></code></pre></div> +<div class="sourceCode" id="cb706"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb706-1" data-line-number="1">sceM <-<span class="st"> </span><span class="kw">indexCell</span>(sceM)</a></code></pre></div> <p>The function <code>indexCluster</code> writes the <code>scmap_cluster_index</code> item of the meta data slot of the reference dataset <code>sceM</code>. This step has two outputs:</p> -<div class="sourceCode" id="cb625"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb625-1" data-line-number="1"><span class="kw">names</span>(<span class="kw">metadata</span>(sceM)<span class="op">$</span>scmap_cell_index)</a></code></pre></div> +<div class="sourceCode" id="cb707"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb707-1" data-line-number="1"><span class="kw">names</span>(<span class="kw">metadata</span>(sceM)<span class="op">$</span>scmap_cell_index)</a></code></pre></div> <pre><code>## [1] "subcentroids" "subclusters"</code></pre> <ol style="list-style-type: decimal"> <li><code>subcentroids</code> returns cluster centers:</li> </ol> -<div class="sourceCode" id="cb627"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb627-1" data-line-number="1"><span class="kw">cat</span>(<span class="kw">length</span>(<span class="kw">metadata</span>(sceM)<span class="op">$</span>scmap_cell_index<span 
class="op">$</span>subcentroids), <span class="st">" chunks </span><span class="ch">\n</span><span class="st">"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb709"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb709-1" data-line-number="1"><span class="kw">cat</span>(<span class="kw">length</span>(<span class="kw">metadata</span>(sceM)<span class="op">$</span>scmap_cell_index<span class="op">$</span>subcentroids), <span class="st">" chunks </span><span class="ch">\n</span><span class="st">"</span>)</a></code></pre></div> <pre><code>## 50 chunks</code></pre> -<div class="sourceCode" id="cb629"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb629-1" data-line-number="1"><span class="kw">cat</span>(<span class="st">"The dimension of cluster centers in each chunk: "</span>, <span class="kw">dim</span>(<span class="kw">metadata</span>(sceM)<span class="op">$</span>scmap_cell_index<span class="op">$</span>subcentroids[[<span class="dv">1</span>]]), <span class="st">"</span><span class="ch">\n</span><span class="st">"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb711"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb711-1" data-line-number="1"><span class="kw">cat</span>(<span class="st">"The dimension of cluster centers in each chunk: "</span>, <span class="kw">dim</span>(<span class="kw">metadata</span>(sceM)<span class="op">$</span>scmap_cell_index<span class="op">$</span>subcentroids[[<span class="dv">1</span>]]), <span class="st">"</span><span class="ch">\n</span><span class="st">"</span>)</a></code></pre></div> <pre><code>## The dimension of cluster centers in each chunk: 10 46</code></pre> <ol start="2" style="list-style-type: decimal"> <li><code>subclusters</code> contains information about which cluster (label) the cells belong to</li> </ol> -<div class="sourceCode" id="cb631"><pre class="sourceCode r"><code class="sourceCode r"><a 
class="sourceLine" id="cb631-1" data-line-number="1"><span class="kw">dim</span>(<span class="kw">metadata</span>(sceM)<span class="op">$</span>scmap_cell_index<span class="op">$</span>subclusters)</a></code></pre></div> +<div class="sourceCode" id="cb713"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb713-1" data-line-number="1"><span class="kw">dim</span>(<span class="kw">metadata</span>(sceM)<span class="op">$</span>scmap_cell_index<span class="op">$</span>subclusters)</a></code></pre></div> <pre><code>## [1] 50 2126</code></pre> -<div class="sourceCode" id="cb633"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb633-1" data-line-number="1"><span class="kw">metadata</span>(sceM)<span class="op">$</span>scmap_cell_index<span class="op">$</span>subclusters[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>,<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>]</a></code></pre></div> +<div class="sourceCode" id="cb715"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb715-1" data-line-number="1"><span class="kw">metadata</span>(sceM)<span class="op">$</span>scmap_cell_index<span class="op">$</span>subclusters[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>,<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>]</a></code></pre></div> <pre><code>## D28.1_1 D28.1_13 D28.1_15 D28.1_17 D28.1_2 -## [1,] 13 25 36 1 29 -## [2,] 7 24 19 17 21 -## [3,] 19 35 7 7 36 -## [4,] 38 27 29 38 41 -## [5,] 8 39 24 40 1</code></pre> +## [1,] 6 11 7 38 36 +## [2,] 1 16 17 44 38 +## [3,] 28 17 4 45 25 +## [4,] 43 41 40 33 22 +## [5,] 36 27 29 11 35</code></pre> <p><strong>Projection:</strong> Once the scmap-cell indexes have been generated we can use them to project the test dataset.</p> -<div class="sourceCode" id="cb635"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" 
id="cb635-1" data-line-number="1">scmapCell_results <-<span class="st"> </span><span class="kw">scmapCell</span>(</a> -<a class="sourceLine" id="cb635-2" data-line-number="2"> <span class="dt">projection =</span> segerstolpe,</a> -<a class="sourceLine" id="cb635-3" data-line-number="3"> <span class="dt">index_list =</span> <span class="kw">list</span>(</a> -<a class="sourceLine" id="cb635-4" data-line-number="4"> <span class="dt">sceM =</span> <span class="kw">metadata</span>(sceM)<span class="op">$</span>scmap_cell_index</a> -<a class="sourceLine" id="cb635-5" data-line-number="5"> ) )</a> -<a class="sourceLine" id="cb635-6" data-line-number="6"><span class="kw">names</span>(scmapCell_results)</a></code></pre></div> +<div class="sourceCode" id="cb717"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb717-1" data-line-number="1">scmapCell_results <-<span class="st"> </span><span class="kw">scmapCell</span>(</a> +<a class="sourceLine" id="cb717-2" data-line-number="2"> <span class="dt">projection =</span> segerstolpe,</a> +<a class="sourceLine" id="cb717-3" data-line-number="3"> <span class="dt">index_list =</span> <span class="kw">list</span>(</a> +<a class="sourceLine" id="cb717-4" data-line-number="4"> <span class="dt">sceM =</span> <span class="kw">metadata</span>(sceM)<span class="op">$</span>scmap_cell_index</a> +<a class="sourceLine" id="cb717-5" data-line-number="5"> ) )</a> +<a class="sourceLine" id="cb717-6" data-line-number="6"><span class="kw">names</span>(scmapCell_results)</a></code></pre></div> <pre><code>## [1] "sceM"</code></pre> <p>The <code>cells</code> matrix contains the top 10 (scmap default) cell IDs of the cells of the reference dataset that a given cell of the projection dataset is closest to:</p> -<div class="sourceCode" id="cb637"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb637-1" data-line-number="1"><span class="kw">dim</span>(scmapCell_results<span 
class="op">$</span>sceM<span class="op">$</span>cells)</a></code></pre></div> +<div class="sourceCode" id="cb719"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb719-1" data-line-number="1"><span class="kw">dim</span>(scmapCell_results<span class="op">$</span>sceM<span class="op">$</span>cells)</a></code></pre></div> <pre><code>## [1] 10 3514</code></pre> <p><strong>Cell annotation:</strong> If cell cluster annotation is available for the reference datasets, scmap-cell can also annotate the cells from the projection dataset @@ -983,37 +1006,54 @@ neighbours (scmap default) and if they all belong to the same cluster in the reference and their maximum similarity is higher than a threshold (0.5 is the scmap default), then a projection cell is assigned to the corresponding reference cluster:</p> -<div class="sourceCode" id="cb639"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb639-1" data-line-number="1">scmapCell_clusters <-<span class="st"> </span><span class="kw">scmapCell2Cluster</span>(</a> -<a class="sourceLine" id="cb639-2" data-line-number="2"> scmapCell_results,</a> -<a class="sourceLine" id="cb639-3" data-line-number="3"> <span class="kw">list</span>(</a> -<a class="sourceLine" id="cb639-4" data-line-number="4"> <span class="kw">colData</span>(sceM)<span class="op">$</span>cell_type1</a> -<a class="sourceLine" id="cb639-5" data-line-number="5"> ))</a></code></pre></div> +<div class="sourceCode" id="cb721"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb721-1" data-line-number="1">scmapCell_clusters <-<span class="st"> </span><span class="kw">scmapCell2Cluster</span>(</a> +<a class="sourceLine" id="cb721-2" data-line-number="2"> scmapCell_results,</a> +<a class="sourceLine" id="cb721-3" data-line-number="3"> <span class="kw">list</span>(</a> +<a class="sourceLine" id="cb721-4" data-line-number="4"> <span class="kw">colData</span>(sceM)<span 
class="op">$</span>cell_type1</a> +<a class="sourceLine" id="cb721-5" data-line-number="5"> ))</a></code></pre></div> <p><strong>Plot result </strong> Compare the annotated result with the original label in the <code>segerstolpe</code> dataset.</p> -<div class="sourceCode" id="cb640"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb640-1" data-line-number="1"><span class="kw">plot</span>(</a> -<a class="sourceLine" id="cb640-2" data-line-number="2"> <span class="kw">getSankey</span>(</a> -<a class="sourceLine" id="cb640-3" data-line-number="3"> segerstolpe<span class="op">$</span>cell_type1,</a> -<a class="sourceLine" id="cb640-4" data-line-number="4"> scmapCell_clusters<span class="op">$</span>combined_labs, <span class="dt">plot_height =</span> <span class="dv">400</span> )</a> -<a class="sourceLine" id="cb640-5" data-line-number="5">)</a></code></pre></div> +<div class="sourceCode" id="cb722"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb722-1" data-line-number="1"><span class="kw">plot</span>(</a> +<a class="sourceLine" id="cb722-2" data-line-number="2"> <span class="kw">getSankey</span>(</a> +<a class="sourceLine" id="cb722-3" data-line-number="3"> segerstolpe<span class="op">$</span>cell_type1,</a> +<a class="sourceLine" id="cb722-4" data-line-number="4"> scmapCell_clusters<span class="op">$</span>combined_labs, <span class="dt">plot_height =</span> <span class="dv">400</span> )</a> +<a class="sourceLine" id="cb722-5" data-line-number="5">)</a></code></pre></div> <center> <img src="figures/sankey.png" style="width:80.0%" /> </center> </div> <div id="sessioninfo-6" class="section level3"> <h3><span class="header-section-number">10.3.3</span> sessionInfo()</h3> -<p>Among the 2126 cells in the data, only 89 are annotated as different labels as the</p> </div> </div> </div> <h3> References</h3> <div id="refs" class="references"> +<div id="ref-blondel2008fast"> +<p>Blondel, Vincent D, Jean-Loup 
Guillaume, Renaud Lambiotte, and Etienne Lefebvre. 2008. “Fast Unfolding of Communities in Large Networks.” <em>Journal of Statistical Mechanics: Theory and Experiment</em> 2008 (10). IOP Publishing: P10008.</p> +</div> <div id="ref-Deng2014-mx"> <p>Deng, Q., D. Ramskold, B. Reinius, and R. Sandberg. 2014. “Single-Cell RNA-Seq Reveals Dynamic, Random Monoallelic Gene Expression in Mammalian Cells.” <em>Science</em> 343 (6167). American Association for the Advancement of Science (AAAS): 193–96. <a href="https://doi.org/10.1126/science.1245316">https://doi.org/10.1126/science.1245316</a>.</p> </div> +<div id="ref-freytag2018comparison"> +<p>Freytag, Saskia, Luyi Tian, Ingrid Lönnstedt, Milica Ng, and Melanie Bahlo. 2018. “Comparison of Clustering Tools in R for Medium-Sized 10x Genomics Single-Cell Rna-Sequencing Data.” <em>F1000Research</em> 7. Faculty of 1000 Ltd.</p> +</div> +<div id="ref-good2010performance"> +<p>Good, Benjamin H, Yves-Alexandre De Montjoye, and Aaron Clauset. 2010. “Performance of Modularity Maximization in Practical Contexts.” <em>Physical Review E</em> 81 (4). APS: 046106.</p> +</div> <div id="ref-Kiselev2016-bq"> <p>Kiselev, Vladimir Yu, Kristina Kirschner, Michael T Schaub, Tallulah Andrews, Andrew Yiu, Tamir Chandra, Kedar N Natarajan, et al. 2017. “SC3: Consensus Clustering of Single-Cell RNA-Seq Data.” <em>Nat Meth</em> 14 (5). Springer Nature: 483–86. <a href="https://doi.org/10.1038/nmeth.4236">https://doi.org/10.1038/nmeth.4236</a>.</p> </div> +<div id="ref-newman2004finding"> +<p>Newman, Mark EJ, and Michelle Girvan. 2004. “Finding and Evaluating Community Structure in Networks.” <em>Physical Review E</em> 69 (2). APS: 026113.</p> +</div> +<div id="ref-traag2019louvain"> +<p>Traag, Vincent A, Ludo Waltman, and Nees Jan van Eck. 2019. “From Louvain to Leiden: Guaranteeing Well-Connected Communities.” <em>Scientific Reports</em> 9. Nature Publishing Group.</p> +</div> +<div id="ref-xu2015identification"> +<p>Xu, Chen, and Zhengchang Su. 2015. 
“Identification of Cell Types from Single-Cell Transcriptomes Using a Novel Clustering Method.†<em>Bioinformatics</em> 31 (12). Oxford University Press: 1974–80.</p> +</div> </div> </section> diff --git a/public/clustering.md b/public/clustering.md index c7b530a282e92aa99997fb02146df22dd988bc0b..09d9e64b1d943f5940b13815572f77f92d8a9c23 100644 --- a/public/clustering.md +++ b/public/clustering.md @@ -62,11 +62,11 @@ Perform Louvain clustering: ```r cl <- igraph::cluster_louvain(deng15)$membership colData(deng)$cl <- factor(cl) -mclust::adjustedRandIndex(colData(deng)$cell_type1, colData(deng)$cl) +mclust::adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$cl) ``` ``` -## [1] 0.8248454 +## [1] 0.4197754 ``` Reaches very high similarity with the labels provided in the original paper. @@ -74,18 +74,22 @@ However, it tend to merge small clusters into larger ones. ```r -table(deng$cell_type1, cl) +table(deng$cell_type2, cl) ``` ``` -## cl -## 1 2 3 -## 16cell 49 0 1 -## 2cell 0 22 0 -## 4cell 0 14 0 -## 8cell 36 0 1 -## blast 0 0 133 -## zygote 0 12 0 +## cl +## 1 2 3 +## 16cell 49 0 1 +## 4cell 0 14 0 +## 8cell 36 0 1 +## early2cell 0 8 0 +## earlyblast 0 0 43 +## late2cell 0 10 0 +## lateblast 0 0 30 +## mid2cell 0 12 0 +## midblast 0 0 60 +## zy 0 4 0 ``` @@ -141,25 +145,55 @@ table(muraro$cell_type1, cl) Let's run `SC3` clustering on the Deng data. The advantage of the `SC3` is that it can directly ingest a `SingleCellExperiment` object. -Now let's image we do not know the number of clusters _k_ (cell types). `SC3` can estimate a number of clusters for you: +`SC3` can estimate a number of clusters: ```r deng <- sc3_estimate_k(deng) -metadata(deng)$sc3$k_estimation ``` -Interestingly, the number of cell types predicted by `SC3` is smaller than in the original data annotation. However, early, mid and late stages of different cell types together, we will have exactly 6 cell types. 
We store the merged cell types in `cell_type1` column of the `colData` slot: +``` +## Estimating k... +``` ```r -plotPCA(deng, colour_by = "cell_type1") +metadata(deng)$sc3$k_estimation +``` + +``` +## [1] 6 ``` -Now we are ready to run `SC3` (we also ask it to calculate biological properties of the clusters): + +Next we run `SC3` (we also ask it to calculate biological properties of the clusters): ```r deng <- sc3(deng, ks = 10, biology = TRUE, n_cores = 1) ``` +``` +## Setting SC3 parameters... +``` + +``` +## Calculating distances between the cells... +``` + +``` +## Performing transformations and calculating eigenvectors... +``` + +``` +## Performing k-means clustering... +``` + +``` +## Calculating consensus matrix... +``` + +``` +## Calculating biology... +``` + `SC3` result consists of several different outputs (please look in [@Kiselev2016-bq] and [SC3 vignette](http://bioconductor.org/packages/release/bioc/vignettes/SC3/inst/doc/my-vignette.html) for more details). Here we show some of them: Consensus matrix: @@ -174,30 +208,42 @@ Silhouette plot: sc3_plot_silhouette(deng, k = 10) ``` +<img src="clustering_files/figure-html/unnamed-chunk-7-1.png" width="672" style="display: block; margin: auto;" /> + Heatmap of the expression matrix: ```r sc3_plot_expression(deng, k = 10, show_pdata = "cell_type2") ``` +<img src="clustering_files/figure-html/unnamed-chunk-8-1.png" width="672" style="display: block; margin: auto;" /> + Identified marker genes: ```r sc3_plot_markers(deng, k = 10, show_pdata = "cell_type2") ``` +<img src="clustering_files/figure-html/unnamed-chunk-9-1.png" width="672" style="display: block; margin: auto;" /> + PCA plot with highlighted `SC3` clusters: ```r plotPCA(deng, colour_by = "sc3_10_clusters") ``` +<img src="clustering_files/figure-html/unnamed-chunk-10-1.png" width="672" style="display: block; margin: auto;" /> + Compare the results of `SC3` clustering with the original publication cell type labels: ```r 
adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$sc3_10_clusters) ``` +``` +## [1] 0.7796181 +``` + __Note__ `SC3` can also be run in an interactive `Shiny` session: ```r @@ -233,7 +279,9 @@ __Note__ Due to direct calculation of distances `SC3` becomes very slow when the <center> {width=80%} </center> - __Step4. Fine tuning__\ We stop here and assign each cell with label that score the highest, actually, if we set the argument ```fine.tune = FALSE```, that is exactly what the package function ```SingleR``` does. - But there is one more question, what if the second highest score is very close to the highest? + But there is one more question, what if the second highest score is very close to the highest? say, 1, 1, 1, 9.5, 10. + `SingleR` set a threshold to define how close is "very close", the default is 0.05. + For (only) the cells that falls into this category, it goes back to Step2. #### Example @@ -345,11 +393,11 @@ metadata(sceM)$scmap_cell_index$subclusters[1:5,1:5] ``` ## D28.1_1 D28.1_13 D28.1_15 D28.1_17 D28.1_2 -## [1,] 13 25 36 1 29 -## [2,] 7 24 19 17 21 -## [3,] 19 35 7 7 36 -## [4,] 38 27 29 38 41 -## [5,] 8 39 24 40 1 +## [1,] 6 11 7 38 36 +## [2,] 1 16 17 44 38 +## [3,] 28 17 4 45 25 +## [4,] 43 41 40 33 22 +## [5,] 36 27 29 11 35 ``` @@ -418,4 +466,4 @@ plot( -Among the 2126 cells in the data, only 89 are annotated as different labels as the + diff --git a/public/clustering_files/figure-html/unnamed-chunk-10-1.png b/public/clustering_files/figure-html/unnamed-chunk-10-1.png index e130523d7a53f39d853284b2a355bd76da7d9b00..6d5ccf4e847a820e02ffd5c35a267e31f4cbafd9 100644 Binary files a/public/clustering_files/figure-html/unnamed-chunk-10-1.png and b/public/clustering_files/figure-html/unnamed-chunk-10-1.png differ diff --git a/public/clustering_files/figure-html/unnamed-chunk-2-1.png b/public/clustering_files/figure-html/unnamed-chunk-2-1.png index de5aa000ac400b3006d6b78cd3eaaa1c200bd523..fb9fc98939683939dc9b953f5ce9a2c8340f04e5 100644 Binary 
files a/public/clustering_files/figure-html/unnamed-chunk-2-1.png and b/public/clustering_files/figure-html/unnamed-chunk-2-1.png differ diff --git a/public/clustering_files/figure-html/unnamed-chunk-7-1.png b/public/clustering_files/figure-html/unnamed-chunk-7-1.png new file mode 100644 index 0000000000000000000000000000000000000000..e1eb136d338f9507677800bd918dcb6486643922 Binary files /dev/null and b/public/clustering_files/figure-html/unnamed-chunk-7-1.png differ diff --git a/public/clustering_files/figure-html/unnamed-chunk-8-1.png b/public/clustering_files/figure-html/unnamed-chunk-8-1.png index e17266be68f84b099b34324ada34f6c519b8ecae..b442cc1c2c85d01b25ec19b30d93415207c1768d 100644 Binary files a/public/clustering_files/figure-html/unnamed-chunk-8-1.png and b/public/clustering_files/figure-html/unnamed-chunk-8-1.png differ diff --git a/public/clustering_files/figure-html/unnamed-chunk-9-1.png b/public/clustering_files/figure-html/unnamed-chunk-9-1.png index 6a5c51c872523efab544c2d789af4924b4877968..e36201544146e0c7103b14cacda59b928c43f500 100644 Binary files a/public/clustering_files/figure-html/unnamed-chunk-9-1.png and b/public/clustering_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/public/comparing-and-combining-scrna-seq-datasets.html b/public/comparing-and-combining-scrna-seq-datasets.html index 9f190e2fdd35ae3928207bcc0405b53078b70e3e..832c937dd48b51181a72959fbb9e64bf7b314362 100644 --- a/public/comparing-and-combining-scrna-seq-datasets.html +++ b/public/comparing-and-combining-scrna-seq-datasets.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" 
data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa 
fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" 
data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> 
Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li 
class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -506,8 +506,8 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <section class="normal" id="section-"> <div id="comparing-and-combining-scrna-seq-datasets" class="section level1"> <h1><span class="header-section-number">14</span> Comparing and combining scRNA-seq datasets</h1> -<div class="sourceCode" id="cb732"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb732-1" data-line-number="1"><span class="kw">library</span>(scater)</a> -<a class="sourceLine" id="cb732-2" data-line-number="2"><span class="kw">library</span>(SingleCellExperiment)</a></code></pre></div> +<div class="sourceCode" id="cb815"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb815-1" data-line-number="1"><span class="kw">library</span>(scater)</a> +<a class="sourceLine" id="cb815-2" data-line-number="2"><span class="kw">library</span>(SingleCellExperiment)</a></code></pre></div> <div id="introduction-9" class="section level3"> <h3><span class="header-section-number">14.0.1</span> Introduction</h3> <p>As more and more scRNA-seq datasets become available, carrying merged_seurat comparisons between them is key. 
There are two main @@ -527,21 +527,21 @@ consistent.</p> <div id="datasets-1" class="section level3"> <h3><span class="header-section-number">14.0.2</span> Datasets</h3> <p>We will running these methods on two human pancreas datasets: <span class="citation">(Muraro et al. <a href="#ref-Muraro2016-yk">2016</a>)</span> and <span class="citation">(Segerstolpe et al. <a href="#ref-Segerstolpe2016-wc">2016</a>)</span>. Since the pancreas has been widely studied, these datasets are well annotated.</p> -<div class="sourceCode" id="cb733"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb733-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/muraro.rds"</span>)</a> -<a class="sourceLine" id="cb733-2" data-line-number="2">segerstolpe <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/segerstolpe.rds"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb816"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb816-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/muraro.rds"</span>)</a> +<a class="sourceLine" id="cb816-2" data-line-number="2">segerstolpe <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/segerstolpe.rds"</span>)</a></code></pre></div> <p>This data has already been formatted for scmap. 
Cell type labels must be stored in the <code>cell_type1</code> column of the <code>colData</code> slots, and gene ids that are consistent across both datasets must be stored in the <code>feature_symbol</code> column of the <code>rowData</code> slots.</p> <p>First, lets check our gene-ids match across both datasets:</p> -<div class="sourceCode" id="cb734"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb734-1" data-line-number="1"><span class="kw">sum</span>(<span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol <span class="op">%in%</span><span class="st"> </span><span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol)<span class="op">/</span><span class="kw">nrow</span>(muraro)</a> -<a class="sourceLine" id="cb734-2" data-line-number="2"><span class="kw">sum</span>(<span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol <span class="op">%in%</span><span class="st"> </span><span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol)<span class="op">/</span><span class="kw">nrow</span>(segerstolpe)</a></code></pre></div> +<div class="sourceCode" id="cb817"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb817-1" data-line-number="1"><span class="kw">sum</span>(<span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol <span class="op">%in%</span><span class="st"> </span><span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol)<span class="op">/</span><span class="kw">nrow</span>(muraro)</a> +<a class="sourceLine" id="cb817-2" data-line-number="2"><span class="kw">sum</span>(<span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol <span class="op">%in%</span><span class="st"> </span><span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol)<span class="op">/</span><span 
class="kw">nrow</span>(segerstolpe)</a></code></pre></div> <p>Here we can see that 96% of the genes present in muraro match genes in segerstople and 72% of genes in segerstolpe are match genes in muraro. This is as expected because the segerstolpe dataset was more deeply sequenced than the muraro dataset. However, it highlights some of the difficulties in comparing scRNASeq datasets.</p> <p>We can confirm this by checking the overall size of these two datasets.</p> -<div class="sourceCode" id="cb735"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb735-1" data-line-number="1"><span class="kw">dim</span>(muraro)</a> -<a class="sourceLine" id="cb735-2" data-line-number="2"><span class="kw">dim</span>(segerstolpe)</a></code></pre></div> +<div class="sourceCode" id="cb818"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb818-1" data-line-number="1"><span class="kw">dim</span>(muraro)</a> +<a class="sourceLine" id="cb818-2" data-line-number="2"><span class="kw">dim</span>(segerstolpe)</a></code></pre></div> <p>In addition, we can check the cell-type annotations for each of these dataset using the command below:</p> -<div class="sourceCode" id="cb736"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb736-1" data-line-number="1"><span class="kw">summary</span>(<span class="kw">factor</span>(<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1))</a> -<a class="sourceLine" id="cb736-2" data-line-number="2"><span class="kw">summary</span>(<span class="kw">factor</span>(<span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1))</a></code></pre></div> +<div class="sourceCode" id="cb819"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb819-1" data-line-number="1"><span class="kw">summary</span>(<span class="kw">factor</span>(<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1))</a> +<a 
class="sourceLine" id="cb819-2" data-line-number="2"><span class="kw">summary</span>(<span class="kw">factor</span>(<span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1))</a></code></pre></div> <p>Here we can see that even though both datasets considered the same biological tissue the two datasets, they have been annotated with slightly different sets of cell-types. If you are familiar withpancreas biology you might recognize that the pancreatic stellate cells (PSCs) in segerstolpe are a type of mesenchymal stem cell which would fall under @@ -553,14 +553,14 @@ we could attempt to infer which of the existing annotations they most likely bel to uncover a novel cell-type among them (or a sub-type within the existing annotations) using cross-dataset normalization.</p> <p>To simplify our demonstration analyses we will remove the small classes of unassigned cells, and the poor quality cells. We will retain the “unclassified endocrine†to see if any of these methods can elucidate what cell-type they belong to.</p> -<div class="sourceCode" id="cb737"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb737-1" data-line-number="1">segerstolpe <-<span class="st"> </span>segerstolpe[,<span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1 <span class="op">!=</span><span class="st"> "unclassified"</span>]</a> -<a class="sourceLine" id="cb737-2" data-line-number="2">segerstolpe <-<span class="st"> </span>segerstolpe[,<span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1 <span class="op">!=</span><span class="st"> "not applicable"</span>,]</a> -<a class="sourceLine" id="cb737-3" data-line-number="3">muraro <-<span class="st"> </span>muraro[,<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1 <span class="op">!=</span><span class="st"> "unclear"</span>]</a></code></pre></div> +<div class="sourceCode" id="cb820"><pre class="sourceCode r"><code 
class="sourceCode r"><a class="sourceLine" id="cb820-1" data-line-number="1">segerstolpe <-<span class="st"> </span>segerstolpe[,<span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1 <span class="op">!=</span><span class="st"> "unclassified"</span>]</a> +<a class="sourceLine" id="cb820-2" data-line-number="2">segerstolpe <-<span class="st"> </span>segerstolpe[,<span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1 <span class="op">!=</span><span class="st"> "not applicable"</span>,]</a> +<a class="sourceLine" id="cb820-3" data-line-number="3">muraro <-<span class="st"> </span>muraro[,<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1 <span class="op">!=</span><span class="st"> "unclear"</span>]</a></code></pre></div> </div> <div id="projecting-cells-onto-annotated-cell-types-scmap" class="section level3"> <h3><span class="header-section-number">14.0.3</span> Projecting cells onto annotated cell-types (scmap)</h3> -<div class="sourceCode" id="cb738"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb738-1" data-line-number="1"><span class="kw">library</span>(scmap)</a> -<a class="sourceLine" id="cb738-2" data-line-number="2"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a></code></pre></div> +<div class="sourceCode" id="cb821"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb821-1" data-line-number="1"><span class="kw">library</span>(scmap)</a> +<a class="sourceLine" id="cb821-2" data-line-number="2"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a></code></pre></div> <p>We recently developed <code>scmap</code> <span class="citation">(Kiselev and Hemberg <a href="#ref-Kiselev2017-nb">2017</a>)</span> - a method for projecting cells from a scRNA-seq experiment onto the cell-types identified in other experiments. 
Additionally, a cloud version of <code>scmap</code> can be run for free, withmerged_seurat restrictions, from <a href="http://www.hemberg-lab.cloud/scmap">http://www.hemberg-lab.cloud/scmap</a>.</p> <div id="feature-selection-1" class="section level4"> <h4><span class="header-section-number">14.0.3.1</span> Feature Selection</h4> @@ -568,15 +568,15 @@ see if any of these methods can elucidate what cell-type they belong to.</p> clusters. Since we want to know whether PSCs and mesenchymal cells are synonymous we will project each dataset to the other so we will build an index for each dataset. This requires first selecting the most informative features for the reference dataset.</p> -<div class="sourceCode" id="cb739"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb739-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">selectFeatures</span>(muraro, <span class="dt">suppress_plot =</span> <span class="ot">FALSE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb822"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb822-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">selectFeatures</span>(muraro, <span class="dt">suppress_plot =</span> <span class="ot">FALSE</span>)</a></code></pre></div> <p>Genes highlighted with the red colour will be used in the futher analysis (projection).</p> -<div class="sourceCode" id="cb740"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb740-1" data-line-number="1">segerstolpe <-<span class="st"> </span><span class="kw">selectFeatures</span>(segerstolpe, <span class="dt">suppress_plot =</span> <span class="ot">FALSE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb823"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb823-1" data-line-number="1">segerstolpe <-<span class="st"> </span><span class="kw">selectFeatures</span>(segerstolpe, <span 
class="dt">suppress_plot =</span> <span class="ot">FALSE</span>)</a></code></pre></div> <p>From the y-axis of these plots we can see that scmap uses a dropmerged_seurat-based feature selection method.</p> <p>Now calculate the cell-type index:</p> -<div class="sourceCode" id="cb741"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb741-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">indexCluster</span>(muraro)</a> -<a class="sourceLine" id="cb741-2" data-line-number="2">segerstolpe <-<span class="st"> </span><span class="kw">indexCluster</span>(segerstolpe)</a></code></pre></div> +<div class="sourceCode" id="cb824"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb824-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">indexCluster</span>(muraro)</a> +<a class="sourceLine" id="cb824-2" data-line-number="2">segerstolpe <-<span class="st"> </span><span class="kw">indexCluster</span>(segerstolpe)</a></code></pre></div> <p>We can also visualize the index:</p> -<div class="sourceCode" id="cb742"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb742-1" data-line-number="1"><span class="kw">heatmap</span>(<span class="kw">as.matrix</span>(<span class="kw">metadata</span>(muraro)<span class="op">$</span>scmap_cluster_index))</a></code></pre></div> +<div class="sourceCode" id="cb825"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb825-1" data-line-number="1"><span class="kw">heatmap</span>(<span class="kw">as.matrix</span>(<span class="kw">metadata</span>(muraro)<span class="op">$</span>scmap_cluster_index))</a></code></pre></div> <p>You may want to adjust your features using the <code>setFeatures</code> function if features are too heavily concentrated in only a few cell-types. 
In this case the dropmerged_seurat-based features look good so we will just them.</p> <p><strong>Exercise</strong> Using the rowData of each dataset how many genes were selected as features in both datasets? What does this tell you abmerged_seurat these datasets?</p> @@ -586,27 +586,27 @@ Using the rowData of each dataset how many genes were selected as features in bo <h4><span class="header-section-number">14.0.3.2</span> Projecting</h4> <p>scmap computes the distance from each cell to each cell-type in the reference index, then applies an empirically derived threshold to determine which cells are assigned to the closest reference cell-type and which are unassigned. To account for differences in sequencing depth distance is calculated using the spearman correlation and cosine distance and only cells with a consistent assignment with both distances are returned as assigned.</p> <p>We will project the <code>segerstolpe</code> dataset to <code>muraro</code> dataset:</p> -<div class="sourceCode" id="cb743"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb743-1" data-line-number="1">seger_to_muraro <-<span class="st"> </span><span class="kw">scmapCluster</span>(</a> -<a class="sourceLine" id="cb743-2" data-line-number="2"> <span class="dt">projection =</span> segerstolpe,</a> -<a class="sourceLine" id="cb743-3" data-line-number="3"> <span class="dt">index_list =</span> <span class="kw">list</span>(</a> -<a class="sourceLine" id="cb743-4" data-line-number="4"> <span class="dt">muraro =</span> <span class="kw">metadata</span>(muraro)<span class="op">$</span>scmap_cluster_index</a> -<a class="sourceLine" id="cb743-5" data-line-number="5"> )</a> -<a class="sourceLine" id="cb743-6" data-line-number="6">)</a></code></pre></div> +<div class="sourceCode" id="cb826"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb826-1" data-line-number="1">seger_to_muraro <-<span class="st"> </span><span 
class="kw">scmapCluster</span>(</a> +<a class="sourceLine" id="cb826-2" data-line-number="2"> <span class="dt">projection =</span> segerstolpe,</a> +<a class="sourceLine" id="cb826-3" data-line-number="3"> <span class="dt">index_list =</span> <span class="kw">list</span>(</a> +<a class="sourceLine" id="cb826-4" data-line-number="4"> <span class="dt">muraro =</span> <span class="kw">metadata</span>(muraro)<span class="op">$</span>scmap_cluster_index</a> +<a class="sourceLine" id="cb826-5" data-line-number="5"> )</a> +<a class="sourceLine" id="cb826-6" data-line-number="6">)</a></code></pre></div> <p>and <code>muraro</code> onto <code>segerstolpe</code></p> -<div class="sourceCode" id="cb744"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb744-1" data-line-number="1">muraro_to_seger <-<span class="st"> </span><span class="kw">scmapCluster</span>(</a> -<a class="sourceLine" id="cb744-2" data-line-number="2"> <span class="dt">projection =</span> muraro,</a> -<a class="sourceLine" id="cb744-3" data-line-number="3"> <span class="dt">index_list =</span> <span class="kw">list</span>(</a> -<a class="sourceLine" id="cb744-4" data-line-number="4"> <span class="dt">seger =</span> <span class="kw">metadata</span>(segerstolpe)<span class="op">$</span>scmap_cluster_index</a> -<a class="sourceLine" id="cb744-5" data-line-number="5"> )</a> -<a class="sourceLine" id="cb744-6" data-line-number="6">)</a></code></pre></div> +<div class="sourceCode" id="cb827"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb827-1" data-line-number="1">muraro_to_seger <-<span class="st"> </span><span class="kw">scmapCluster</span>(</a> +<a class="sourceLine" id="cb827-2" data-line-number="2"> <span class="dt">projection =</span> muraro,</a> +<a class="sourceLine" id="cb827-3" data-line-number="3"> <span class="dt">index_list =</span> <span class="kw">list</span>(</a> +<a class="sourceLine" id="cb827-4" data-line-number="4"> <span 
class="dt">seger =</span> <span class="kw">metadata</span>(segerstolpe)<span class="op">$</span>scmap_cluster_index</a> +<a class="sourceLine" id="cb827-5" data-line-number="5"> )</a> +<a class="sourceLine" id="cb827-6" data-line-number="6">)</a></code></pre></div> <p>Note that in each case we are projecting to a single dataset but that this could be extended to any number of datasets for which we have computed indices.</p> <p>Now lets compare the original cell-type labels with the projected labels:</p> -<div class="sourceCode" id="cb745"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb745-1" data-line-number="1"><span class="kw">table</span>(<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1, muraro_to_seger<span class="op">$</span>scmap_cluster_labs)</a></code></pre></div> +<div class="sourceCode" id="cb828"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb828-1" data-line-number="1"><span class="kw">table</span>(<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1, muraro_to_seger<span class="op">$</span>scmap_cluster_labs)</a></code></pre></div> <p>Here we can see that cell-types do map to their equivalents in segerstolpe, and importantly we see that all but one of the “mesenchymal†cells were assigned to the “PSC†class.</p> -<div class="sourceCode" id="cb746"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb746-1" data-line-number="1"><span class="kw">table</span>(<span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1, seger_to_muraro<span class="op">$</span>scmap_cluster_labs)</a></code></pre></div> +<div class="sourceCode" id="cb829"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb829-1" data-line-number="1"><span class="kw">table</span>(<span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1, seger_to_muraro<span 
class="op">$</span>scmap_cluster_labs)</a></code></pre></div> <p>Again we see cell-types match each other and that all but one of the “PSCs†match the “mesenchymal†cells providing strong evidence that these two annotations should be considered synonymous.</p> <p>We can also visualize these tables using a <a href="https://developers.google.com/chart/interactive/docs/gallery/sankey">Sankey diagram</a>:</p> -<div class="sourceCode" id="cb747"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb747-1" data-line-number="1"><span class="kw">plot</span>(<span class="kw">getSankey</span>(<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1, muraro_to_seger<span class="op">$</span>scmap_cluster_labs[,<span class="dv">1</span>], <span class="dt">plot_height=</span><span class="dv">400</span>))</a></code></pre></div> +<div class="sourceCode" id="cb830"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb830-1" data-line-number="1"><span class="kw">plot</span>(<span class="kw">getSankey</span>(<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1, muraro_to_seger<span class="op">$</span>scmap_cluster_labs[,<span class="dv">1</span>], <span class="dt">plot_height=</span><span class="dv">400</span>))</a></code></pre></div> <p><strong>Exercise</strong> How many of the previously unclassified cells would be be able to assign to cell-types using scmap?</p> <p><strong>Answer</strong></p> @@ -618,29 +618,29 @@ How many of the previously unclassified cells would be be able to assign to cell uses a highly optimized search algorithm allowing it to be scaled to very large references (in theory 100,000-millions of cells). 
However, this process is stochastic so we must fix the random seed to ensure we can reproduce our results.</p> <p>We have already performed feature selection for this dataset so we can go straight to building the index.</p> -<div class="sourceCode" id="cb748"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb748-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">193047</span>)</a> -<a class="sourceLine" id="cb748-2" data-line-number="2">segerstolpe <-<span class="st"> </span><span class="kw">indexCell</span>(segerstolpe)</a> -<a class="sourceLine" id="cb748-3" data-line-number="3">muraro <-<span class="st"> </span><span class="kw">indexCell</span>(muraro)</a></code></pre></div> +<div class="sourceCode" id="cb831"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb831-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">193047</span>)</a> +<a class="sourceLine" id="cb831-2" data-line-number="2">segerstolpe <-<span class="st"> </span><span class="kw">indexCell</span>(segerstolpe)</a> +<a class="sourceLine" id="cb831-3" data-line-number="3">muraro <-<span class="st"> </span><span class="kw">indexCell</span>(muraro)</a></code></pre></div> <p>In this case the index is a series of clusterings of each cell using different sets of features, parameters k and M are the number of clusters and the number of features used in each of these subclusterings. New cells are assigned to the nearest cluster in each subclustering to generate unique pattern of cluster assignments. 
We then find the cell in the reference dataset with the same or most similar pattern of cluster assignments.</p> <p>We can examine the cluster assignment patterns for the reference datasets using:</p> -<div class="sourceCode" id="cb749"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb749-1" data-line-number="1"><span class="kw">metadata</span>(muraro)<span class="op">$</span>scmap_cell_index<span class="op">$</span>subclusters[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>,<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>]</a></code></pre></div> +<div class="sourceCode" id="cb832"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb832-1" data-line-number="1"><span class="kw">metadata</span>(muraro)<span class="op">$</span>scmap_cell_index<span class="op">$</span>subclusters[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>,<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>]</a></code></pre></div> <p>To project and find the <code>w</code> nearest neighbours we use a similar command as before:</p> -<div class="sourceCode" id="cb750"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb750-1" data-line-number="1">muraro_to_seger <-<span class="st"> </span><span class="kw">scmapCell</span>(</a> -<a class="sourceLine" id="cb750-2" data-line-number="2"> <span class="dt">projection =</span> muraro,</a> -<a class="sourceLine" id="cb750-3" data-line-number="3"> <span class="dt">index_list =</span> <span class="kw">list</span>(</a> -<a class="sourceLine" id="cb750-4" data-line-number="4"> <span class="dt">seger =</span> <span class="kw">metadata</span>(segerstolpe)<span class="op">$</span>scmap_cell_index</a> -<a class="sourceLine" id="cb750-5" data-line-number="5"> ),</a> -<a class="sourceLine" id="cb750-6" data-line-number="6"> <span class="dt">w =</span> <span 
class="dv">5</span></a> -<a class="sourceLine" id="cb750-7" data-line-number="7">)</a></code></pre></div> +<div class="sourceCode" id="cb833"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb833-1" data-line-number="1">muraro_to_seger <-<span class="st"> </span><span class="kw">scmapCell</span>(</a> +<a class="sourceLine" id="cb833-2" data-line-number="2"> <span class="dt">projection =</span> muraro,</a> +<a class="sourceLine" id="cb833-3" data-line-number="3"> <span class="dt">index_list =</span> <span class="kw">list</span>(</a> +<a class="sourceLine" id="cb833-4" data-line-number="4"> <span class="dt">seger =</span> <span class="kw">metadata</span>(segerstolpe)<span class="op">$</span>scmap_cell_index</a> +<a class="sourceLine" id="cb833-5" data-line-number="5"> ),</a> +<a class="sourceLine" id="cb833-6" data-line-number="6"> <span class="dt">w =</span> <span class="dv">5</span></a> +<a class="sourceLine" id="cb833-7" data-line-number="7">)</a></code></pre></div> <p>We can again look at the results:</p> -<div class="sourceCode" id="cb751"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb751-1" data-line-number="1">muraro_to_seger<span class="op">$</span>seger[[<span class="dv">1</span>]][,<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>]</a></code></pre></div> +<div class="sourceCode" id="cb834"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb834-1" data-line-number="1">muraro_to_seger<span class="op">$</span>seger[[<span class="dv">1</span>]][,<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>]</a></code></pre></div> <p>This shows the column number of the 5 nearest neighbours in segerstolpe to each of the cells in muraro. We could then calculate a pseudotime estimate, branch assignment, or other cell-level data by selecting the appropriate data from the colData of the segerstolpe data set. 
As a demonstration we will find the cell-type of the nearest neighbour of each cell.</p> -<div class="sourceCode" id="cb752"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb752-1" data-line-number="1">cell_type_NN <-<span class="st"> </span><span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1[muraro_to_seger<span class="op">$</span>seger[[<span class="dv">1</span>]][<span class="dv">1</span>,]]</a> -<a class="sourceLine" id="cb752-2" data-line-number="2"><span class="kw">head</span>(cell_type_NN)</a></code></pre></div> +<div class="sourceCode" id="cb835"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb835-1" data-line-number="1">cell_type_NN <-<span class="st"> </span><span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1[muraro_to_seger<span class="op">$</span>seger[[<span class="dv">1</span>]][<span class="dv">1</span>,]]</a> +<a class="sourceLine" id="cb835-2" data-line-number="2"><span class="kw">head</span>(cell_type_NN)</a></code></pre></div> </div> <div id="metaneighbour" class="section level3"> <h3><span class="header-section-number">14.0.5</span> Metaneighbour</h3> @@ -648,38 +648,38 @@ the segerstolpe data set. As a demonstration we will find the cell-type of the n versions. First is a fully supervised method which assumes cell-types are known in all datasets and calculates how “good†those cell-type labels are. (The precise meaning of “good†will be described below). Alternatively, metaneighbour can estimate how similar all cell-types are to each other both within and across datasets. We will only be using the unsupervised version as it has much more general applicability and is easier to interpret the results of.</p> <p>Metaneighbour compares cell-types across datasets by building a cell-cell spearman correlation network. The method then tries to predict the label of each cell through weighted “votes†of its nearest-neighbours. 
Then scores the overall similarity between two clusters as the AUROC for assigning cells of typeA to typeB based on these weighted votes. AUROC of 1 would indicate all the cells of typeA were assigned to typeB before any other cells were, and an AUROC of 0.5 is what you would get if cells were being randomly assigned.</p> <p>Metanighbour is just a couple of R functions not a complete package so we have to load them using <code>source</code></p> -<div class="sourceCode" id="cb753"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb753-1" data-line-number="1"><span class="kw">source</span>(<span class="st">"course_files/utils/2017-08-28-runMN-US.R"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb836"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb836-1" data-line-number="1"><span class="kw">source</span>(<span class="st">"course_files/utils/2017-08-28-runMN-US.R"</span>)</a></code></pre></div> <div id="prepare-data" class="section level4"> <h4><span class="header-section-number">14.0.5.1</span> Prepare Data</h4> <p>Metaneighbour requires all datasets to be combined into a single expression matrix prior to running:</p> -<div class="sourceCode" id="cb754"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb754-1" data-line-number="1">is.common <-<span class="st"> </span><span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol <span class="op">%in%</span><span class="st"> </span><span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol</a> -<a class="sourceLine" id="cb754-2" data-line-number="2">muraro <-<span class="st"> </span>muraro[is.common,]</a> -<a class="sourceLine" id="cb754-3" data-line-number="3">segerstolpe <-<span class="st"> </span>segerstolpe[<span class="kw">match</span>(<span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol, <span class="kw">rowData</span>(segerstolpe)<span 
class="op">$</span>feature_symbol),]</a> -<a class="sourceLine" id="cb754-4" data-line-number="4"><span class="kw">rownames</span>(segerstolpe) <-<span class="st"> </span><span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol</a> -<a class="sourceLine" id="cb754-5" data-line-number="5"><span class="kw">rownames</span>(muraro) <-<span class="st"> </span><span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol</a> -<a class="sourceLine" id="cb754-6" data-line-number="6"><span class="kw">identical</span>(<span class="kw">rownames</span>(segerstolpe), <span class="kw">rownames</span>(muraro))</a> -<a class="sourceLine" id="cb754-7" data-line-number="7"></a> -<a class="sourceLine" id="cb754-8" data-line-number="8">combined_logcounts <-<span class="st"> </span><span class="kw">cbind</span>(<span class="kw">logcounts</span>(muraro), <span class="kw">logcounts</span>(segerstolpe))</a> -<a class="sourceLine" id="cb754-9" data-line-number="9">dataset_labels <-<span class="st"> </span><span class="kw">rep</span>(<span class="kw">c</span>(<span class="st">"m"</span>, <span class="st">"s"</span>), <span class="dt">times=</span><span class="kw">c</span>(<span class="kw">ncol</span>(muraro), <span class="kw">ncol</span>(segerstolpe)))</a> -<a class="sourceLine" id="cb754-10" data-line-number="10">cell_type_labels <-<span class="st"> </span><span class="kw">c</span>(<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1, <span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1)</a> -<a class="sourceLine" id="cb754-11" data-line-number="11"></a> -<a class="sourceLine" id="cb754-12" data-line-number="12">pheno <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">Sample_ID =</span> <span class="kw">colnames</span>(combined_logcounts),</a> -<a class="sourceLine" id="cb754-13" data-line-number="13"> <span class="dt">Study_ID=</span>dataset_labels,</a> -<a 
class="sourceLine" id="cb754-14" data-line-number="14"> <span class="dt">Celltype=</span><span class="kw">paste</span>(cell_type_labels, dataset_labels, <span class="dt">sep=</span><span class="st">"-"</span>))</a> -<a class="sourceLine" id="cb754-15" data-line-number="15"><span class="kw">rownames</span>(pheno) <-<span class="st"> </span><span class="kw">colnames</span>(combined_logcounts)</a></code></pre></div> +<div class="sourceCode" id="cb837"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb837-1" data-line-number="1">is.common <-<span class="st"> </span><span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol <span class="op">%in%</span><span class="st"> </span><span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol</a> +<a class="sourceLine" id="cb837-2" data-line-number="2">muraro <-<span class="st"> </span>muraro[is.common,]</a> +<a class="sourceLine" id="cb837-3" data-line-number="3">segerstolpe <-<span class="st"> </span>segerstolpe[<span class="kw">match</span>(<span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol, <span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol),]</a> +<a class="sourceLine" id="cb837-4" data-line-number="4"><span class="kw">rownames</span>(segerstolpe) <-<span class="st"> </span><span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol</a> +<a class="sourceLine" id="cb837-5" data-line-number="5"><span class="kw">rownames</span>(muraro) <-<span class="st"> </span><span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol</a> +<a class="sourceLine" id="cb837-6" data-line-number="6"><span class="kw">identical</span>(<span class="kw">rownames</span>(segerstolpe), <span class="kw">rownames</span>(muraro))</a> +<a class="sourceLine" id="cb837-7" data-line-number="7"></a> +<a class="sourceLine" id="cb837-8" data-line-number="8">combined_logcounts <-<span 
class="st"> </span><span class="kw">cbind</span>(<span class="kw">logcounts</span>(muraro), <span class="kw">logcounts</span>(segerstolpe))</a> +<a class="sourceLine" id="cb837-9" data-line-number="9">dataset_labels <-<span class="st"> </span><span class="kw">rep</span>(<span class="kw">c</span>(<span class="st">"m"</span>, <span class="st">"s"</span>), <span class="dt">times=</span><span class="kw">c</span>(<span class="kw">ncol</span>(muraro), <span class="kw">ncol</span>(segerstolpe)))</a> +<a class="sourceLine" id="cb837-10" data-line-number="10">cell_type_labels <-<span class="st"> </span><span class="kw">c</span>(<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1, <span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1)</a> +<a class="sourceLine" id="cb837-11" data-line-number="11"></a> +<a class="sourceLine" id="cb837-12" data-line-number="12">pheno <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">Sample_ID =</span> <span class="kw">colnames</span>(combined_logcounts),</a> +<a class="sourceLine" id="cb837-13" data-line-number="13"> <span class="dt">Study_ID=</span>dataset_labels,</a> +<a class="sourceLine" id="cb837-14" data-line-number="14"> <span class="dt">Celltype=</span><span class="kw">paste</span>(cell_type_labels, dataset_labels, <span class="dt">sep=</span><span class="st">"-"</span>))</a> +<a class="sourceLine" id="cb837-15" data-line-number="15"><span class="kw">rownames</span>(pheno) <-<span class="st"> </span><span class="kw">colnames</span>(combined_logcounts)</a></code></pre></div> <p>Metaneighbor includes a feature selection method to identify highly variable genes.</p> -<div class="sourceCode" id="cb755"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb755-1" data-line-number="1">var.genes =<span class="st"> </span><span class="kw">get_variable_genes</span>(combined_logcounts, pheno)</a></code></pre></div> +<div class="sourceCode" 
id="cb838"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb838-1" data-line-number="1">var.genes =<span class="st"> </span><span class="kw">get_variable_genes</span>(combined_logcounts, pheno)</a></code></pre></div> <p>Since Metaneighbor is much slower than <code>scmap</code>, we will down sample these datasets.</p> -<div class="sourceCode" id="cb756"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb756-1" data-line-number="1">subset <-<span class="st"> </span><span class="kw">sample</span>(<span class="dv">1</span><span class="op">:</span><span class="kw">nrow</span>(pheno), <span class="dv">2000</span>)</a> -<a class="sourceLine" id="cb756-2" data-line-number="2">combined_logcounts <-<span class="st"> </span>combined_logcounts[,subset]</a> -<a class="sourceLine" id="cb756-3" data-line-number="3">pheno <-<span class="st"> </span>pheno[subset,]</a> -<a class="sourceLine" id="cb756-4" data-line-number="4">cell_type_labels <-<span class="st"> </span>cell_type_labels[subset]</a> -<a class="sourceLine" id="cb756-5" data-line-number="5">dataset_labels <-<span class="st"> </span>dataset_labels[subset]</a></code></pre></div> +<div class="sourceCode" id="cb839"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb839-1" data-line-number="1">subset <-<span class="st"> </span><span class="kw">sample</span>(<span class="dv">1</span><span class="op">:</span><span class="kw">nrow</span>(pheno), <span class="dv">2000</span>)</a> +<a class="sourceLine" id="cb839-2" data-line-number="2">combined_logcounts <-<span class="st"> </span>combined_logcounts[,subset]</a> +<a class="sourceLine" id="cb839-3" data-line-number="3">pheno <-<span class="st"> </span>pheno[subset,]</a> +<a class="sourceLine" id="cb839-4" data-line-number="4">cell_type_labels <-<span class="st"> </span>cell_type_labels[subset]</a> +<a class="sourceLine" id="cb839-5" data-line-number="5">dataset_labels <-<span class="st"> 
</span>dataset_labels[subset]</a></code></pre></div> <p>Now we are ready to run Metaneighbor. First we will run the unsupervised version that will let us see which cell-types are most similar across the two datasets.</p> -<div class="sourceCode" id="cb757"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb757-1" data-line-number="1">unsup <-<span class="st"> </span><span class="kw">run_MetaNeighbor_US</span>(var.genes, combined_logcounts, <span class="kw">unique</span>(pheno<span class="op">$</span>Celltype), pheno)</a> -<a class="sourceLine" id="cb757-2" data-line-number="2"><span class="kw">heatmap</span>(unsup)</a></code></pre></div> +<div class="sourceCode" id="cb840"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb840-1" data-line-number="1">unsup <-<span class="st"> </span><span class="kw">run_MetaNeighbor_US</span>(var.genes, combined_logcounts, <span class="kw">unique</span>(pheno<span class="op">$</span>Celltype), pheno)</a> +<a class="sourceLine" id="cb840-2" data-line-number="2"><span class="kw">heatmap</span>(unsup)</a></code></pre></div> </div> </div> <div id="mnncorrect-1" class="section level3"> @@ -688,18 +688,18 @@ two datasets.</p> <p>To match individual cells to each other across datasets, mnnCorrect uses the cosine distance to avoid library-size effect then identifies mututal nearest neighbours (<code>k</code> determines to neighbourhood size) across datasets. Only overlaping biological groups should have mutual nearest neighbours (see panel b below). 
However, this assumes that k is set to approximately the size of the smallest biological group in the datasets, but a k that is too low will identify too few mutual nearest-neighbour pairs to get a good estimate of the batch effect we want to remove.</p> <p>Learning the biological/techncial effects is done with either singular value decomposition, similar to RUV we encounters in the batch-correction section, or with principal component analysis with the opitimized irlba package, which should be faster than SVD. The parameter <code>svd.dim</code> specifies how many dimensions should be kept to summarize the biological structure of the data, we will set it to three as we found three major groups using Metaneighbor above. These estimates may be futher adjusted by smoothing (<code>sigma</code>) and/or variance adjustment (<code>var.adj</code>).</p> <p>mnnCorrect also assumes you’ve already subset your expression matricies so that they contain identical genes in the same order, fortunately we have already done with for our datasets when we set up our data for Metaneighbor.</p> -<div class="sourceCode" id="cb758"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb758-1" data-line-number="1"><span class="kw">require</span>(<span class="st">"batchelor"</span>)</a> -<a class="sourceLine" id="cb758-2" data-line-number="2"><span class="co"># mnnCorrect will take several minutes to run</span></a> -<a class="sourceLine" id="cb758-3" data-line-number="3">corrected <-<span class="st"> </span><span class="kw">mnnCorrect</span>(<span class="kw">logcounts</span>(muraro), <span class="kw">logcounts</span>(segerstolpe), <span class="dt">k=</span><span class="dv">20</span>, <span class="dt">sigma=</span><span class="dv">1</span>, <span class="dt">pc.approx=</span><span class="ot">TRUE</span>, <span class="dt">subset.row=</span>var.genes, <span class="dt">svd.dim=</span><span class="dv">3</span>)</a></code></pre></div> +<div class="sourceCode" 
id="cb841"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb841-1" data-line-number="1"><span class="kw">require</span>(<span class="st">"batchelor"</span>)</a> +<a class="sourceLine" id="cb841-2" data-line-number="2"><span class="co"># mnnCorrect will take several minutes to run</span></a> +<a class="sourceLine" id="cb841-3" data-line-number="3">corrected <-<span class="st"> </span><span class="kw">mnnCorrect</span>(<span class="kw">logcounts</span>(muraro), <span class="kw">logcounts</span>(segerstolpe), <span class="dt">k=</span><span class="dv">20</span>, <span class="dt">sigma=</span><span class="dv">1</span>, <span class="dt">pc.approx=</span><span class="ot">TRUE</span>, <span class="dt">subset.row=</span>var.genes, <span class="dt">svd.dim=</span><span class="dv">3</span>)</a></code></pre></div> <p>First let’s check that we found a sufficient number of mnn pairs, mnnCorrect returns a list of dataframe with the mnn pairs for each dataset.</p> -<div class="sourceCode" id="cb759"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb759-1" data-line-number="1"><span class="kw">dim</span>(corrected<span class="op">$</span>pairs[[<span class="dv">1</span>]]) <span class="co"># muraro -> others</span></a> -<a class="sourceLine" id="cb759-2" data-line-number="2"><span class="kw">dim</span>(corrected<span class="op">$</span>pairs[[<span class="dv">2</span>]]) <span class="co"># seger -> others</span></a></code></pre></div> +<div class="sourceCode" id="cb842"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb842-1" data-line-number="1"><span class="kw">dim</span>(corrected<span class="op">$</span>pairs[[<span class="dv">1</span>]]) <span class="co"># muraro -> others</span></a> +<a class="sourceLine" id="cb842-2" data-line-number="2"><span class="kw">dim</span>(corrected<span class="op">$</span>pairs[[<span class="dv">2</span>]]) <span class="co"># seger -> 
others</span></a></code></pre></div> <p>The first and second columns contain the cell column IDs and the third column contains a number indicating which dataset/batch the column 2 cell belongs to. In our case, we are only comparing two datasets so all the mnn pairs have been assigned to the second table and the third column contains only ones</p> -<div class="sourceCode" id="cb760"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb760-1" data-line-number="1"><span class="kw">head</span>(corrected<span class="op">$</span>pairs[[<span class="dv">2</span>]])</a> -<a class="sourceLine" id="cb760-2" data-line-number="2">total_pairs <-<span class="st"> </span><span class="kw">nrow</span>(corrected<span class="op">$</span>pairs[[<span class="dv">2</span>]])</a> -<a class="sourceLine" id="cb760-3" data-line-number="3">n_unique_seger <-<span class="st"> </span><span class="kw">length</span>(<span class="kw">unique</span>((corrected<span class="op">$</span>pairs[[<span class="dv">2</span>]][,<span class="dv">1</span>])))</a> -<a class="sourceLine" id="cb760-4" data-line-number="4">n_unique_muraro <-<span class="st"> </span><span class="kw">length</span>(<span class="kw">unique</span>((corrected<span class="op">$</span>pairs[[<span class="dv">2</span>]][,<span class="dv">2</span>])))</a></code></pre></div> +<div class="sourceCode" id="cb843"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb843-1" data-line-number="1"><span class="kw">head</span>(corrected<span class="op">$</span>pairs[[<span class="dv">2</span>]])</a> +<a class="sourceLine" id="cb843-2" data-line-number="2">total_pairs <-<span class="st"> </span><span class="kw">nrow</span>(corrected<span class="op">$</span>pairs[[<span class="dv">2</span>]])</a> +<a class="sourceLine" id="cb843-3" data-line-number="3">n_unique_seger <-<span class="st"> </span><span class="kw">length</span>(<span class="kw">unique</span>((corrected<span 
class="op">$</span>pairs[[<span class="dv">2</span>]][,<span class="dv">1</span>])))</a> +<a class="sourceLine" id="cb843-4" data-line-number="4">n_unique_muraro <-<span class="st"> </span><span class="kw">length</span>(<span class="kw">unique</span>((corrected<span class="op">$</span>pairs[[<span class="dv">2</span>]][,<span class="dv">2</span>])))</a></code></pre></div> <p>mnnCorrect found “r total_pairs†sets of mutual nearest-neighbours between <code>n_unique_seger</code> segerstolpe cells and <code>n_unique_muraro</code> muraro cells. This should be a sufficient number of pairs but the low number of unique cells in @@ -709,15 +709,15 @@ each dataset.</p> Which cell-types had mnns across these datasets? Should we increase/decrease k?</p> <p><strong>Answer</strong></p> <p>Now we could create a combined dataset to jointly analyse these data. However, the corrected data is no longer counts and usually will contain negative expression values thus some analysis tools may no longer be appropriate. 
For simplicity let’s just plot a joint TSNE.</p> -<div class="sourceCode" id="cb761"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb761-1" data-line-number="1"><span class="kw">require</span>(<span class="st">"Rtsne"</span>)</a> -<a class="sourceLine" id="cb761-2" data-line-number="2">joint_expression_matrix <-<span class="st"> </span><span class="kw">cbind</span>(corrected<span class="op">$</span>corrected[[<span class="dv">1</span>]], corrected<span class="op">$</span>corrected[[<span class="dv">2</span>]])</a> -<a class="sourceLine" id="cb761-3" data-line-number="3"></a> -<a class="sourceLine" id="cb761-4" data-line-number="4"><span class="co"># Tsne will take some time to run on the full dataset</span></a> -<a class="sourceLine" id="cb761-5" data-line-number="5">joint_tsne <-<span class="st"> </span><span class="kw">Rtsne</span>(<span class="kw">t</span>(joint_expression_matrix[<span class="kw">rownames</span>(joint_expression_matrix) <span class="op">%in%</span><span class="st"> </span>var.genes,]), <span class="dt">initial_dims=</span><span class="dv">10</span>, <span class="dt">theta=</span><span class="fl">0.75</span>,</a> -<a class="sourceLine" id="cb761-6" data-line-number="6"> <span class="dt">check_duplicates=</span><span class="ot">FALSE</span>, <span class="dt">max_iter=</span><span class="dv">200</span>, <span class="dt">stop_lying_iter=</span><span class="dv">50</span>, <span class="dt">mom_switch_iter=</span><span class="dv">50</span>)</a> -<a class="sourceLine" id="cb761-7" data-line-number="7">dataset_labels <-<span class="st"> </span><span class="kw">factor</span>(<span class="kw">rep</span>(<span class="kw">c</span>(<span class="st">"m"</span>, <span class="st">"s"</span>), <span class="dt">times=</span><span class="kw">c</span>(<span class="kw">ncol</span>(muraro), <span class="kw">ncol</span>(segerstolpe))))</a> -<a class="sourceLine" id="cb761-8" data-line-number="8">cell_type_labels <-<span class="st"> 
</span><span class="kw">factor</span>(<span class="kw">c</span>(<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1, <span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1))</a> -<a class="sourceLine" id="cb761-9" data-line-number="9"><span class="kw">plot</span>(joint_tsne<span class="op">$</span>Y[,<span class="dv">1</span>], joint_tsne<span class="op">$</span>Y[,<span class="dv">2</span>], <span class="dt">pch=</span><span class="kw">c</span>(<span class="dv">16</span>,<span class="dv">1</span>)[dataset_labels], <span class="dt">col=</span><span class="kw">rainbow</span>(<span class="kw">length</span>(<span class="kw">levels</span>(cell_type_labels)))[cell_type_labels])</a></code></pre></div> +<div class="sourceCode" id="cb844"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb844-1" data-line-number="1"><span class="kw">require</span>(<span class="st">"Rtsne"</span>)</a> +<a class="sourceLine" id="cb844-2" data-line-number="2">joint_expression_matrix <-<span class="st"> </span><span class="kw">cbind</span>(corrected<span class="op">$</span>corrected[[<span class="dv">1</span>]], corrected<span class="op">$</span>corrected[[<span class="dv">2</span>]])</a> +<a class="sourceLine" id="cb844-3" data-line-number="3"></a> +<a class="sourceLine" id="cb844-4" data-line-number="4"><span class="co"># Tsne will take some time to run on the full dataset</span></a> +<a class="sourceLine" id="cb844-5" data-line-number="5">joint_tsne <-<span class="st"> </span><span class="kw">Rtsne</span>(<span class="kw">t</span>(joint_expression_matrix[<span class="kw">rownames</span>(joint_expression_matrix) <span class="op">%in%</span><span class="st"> </span>var.genes,]), <span class="dt">initial_dims=</span><span class="dv">10</span>, <span class="dt">theta=</span><span class="fl">0.75</span>,</a> +<a class="sourceLine" id="cb844-6" data-line-number="6"> <span class="dt">check_duplicates=</span><span 
class="ot">FALSE</span>, <span class="dt">max_iter=</span><span class="dv">200</span>, <span class="dt">stop_lying_iter=</span><span class="dv">50</span>, <span class="dt">mom_switch_iter=</span><span class="dv">50</span>)</a> +<a class="sourceLine" id="cb844-7" data-line-number="7">dataset_labels <-<span class="st"> </span><span class="kw">factor</span>(<span class="kw">rep</span>(<span class="kw">c</span>(<span class="st">"m"</span>, <span class="st">"s"</span>), <span class="dt">times=</span><span class="kw">c</span>(<span class="kw">ncol</span>(muraro), <span class="kw">ncol</span>(segerstolpe))))</a> +<a class="sourceLine" id="cb844-8" data-line-number="8">cell_type_labels <-<span class="st"> </span><span class="kw">factor</span>(<span class="kw">c</span>(<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1, <span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1))</a> +<a class="sourceLine" id="cb844-9" data-line-number="9"><span class="kw">plot</span>(joint_tsne<span class="op">$</span>Y[,<span class="dv">1</span>], joint_tsne<span class="op">$</span>Y[,<span class="dv">2</span>], <span class="dt">pch=</span><span class="kw">c</span>(<span class="dv">16</span>,<span class="dv">1</span>)[dataset_labels], <span class="dt">col=</span><span class="kw">rainbow</span>(<span class="kw">length</span>(<span class="kw">levels</span>(cell_type_labels)))[cell_type_labels])</a></code></pre></div> </div> <div id="cannonical-correlation-analysis-seurat" class="section level3"> <h3><span class="header-section-number">14.0.7</span> Cannonical Correlation Analysis (Seurat)</h3> @@ -725,58 +725,58 @@ Which cell-types had mnns across these datasets? Should we increase/decrease k?< <p>Seurat uses gene-gene correlations to identify the biological structure in the dataset with a method called canonical correlation analysis (CCA). 
Seurat learns the shared structure to the gene-gene correlations and then evaluates how well each cell fits this structure. Cells which must better described by a data-specific dimensionality reduction method than by the shared correlation structure are assumed to represent dataset-specific cell-types/states and are discarded before aligning the two datasets. Finally the two datasets are aligned using ‘warping’ algorithms which normalize the low-dimensional representations of each dataset in a way that is robust to differences in population density.</p> <p>Note because Seurat uses up a lot of library space you will have to restart your R-session to load it, and the plots/output won’t be automatically generated on this page.</p> <p>Reload the data:</p> -<div class="sourceCode" id="cb762"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb762-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/muraro.rds"</span>)</a> -<a class="sourceLine" id="cb762-2" data-line-number="2">segerstolpe <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/segerstolpe.rds"</span>)</a> -<a class="sourceLine" id="cb762-3" data-line-number="3">segerstolpe <-<span class="st"> </span>segerstolpe[,<span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1 <span class="op">!=</span><span class="st"> "unclassified"</span>]</a> -<a class="sourceLine" id="cb762-4" data-line-number="4">segerstolpe <-<span class="st"> </span>segerstolpe[,<span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1 <span class="op">!=</span><span class="st"> "not applicable"</span>,]</a> -<a class="sourceLine" id="cb762-5" data-line-number="5">muraro <-<span class="st"> </span>muraro[,<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1 <span class="op">!=</span><span class="st"> "unclear"</span>]</a> -<a 
class="sourceLine" id="cb762-6" data-line-number="6">is.common <-<span class="st"> </span><span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol <span class="op">%in%</span><span class="st"> </span><span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol</a> -<a class="sourceLine" id="cb762-7" data-line-number="7">muraro <-<span class="st"> </span>muraro[is.common,]</a> -<a class="sourceLine" id="cb762-8" data-line-number="8">segerstolpe <-<span class="st"> </span>segerstolpe[<span class="kw">match</span>(<span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol, <span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol),]</a> -<a class="sourceLine" id="cb762-9" data-line-number="9"><span class="kw">rownames</span>(segerstolpe) <-<span class="st"> </span><span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol</a> -<a class="sourceLine" id="cb762-10" data-line-number="10"><span class="kw">rownames</span>(muraro) <-<span class="st"> </span><span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol</a> -<a class="sourceLine" id="cb762-11" data-line-number="11"><span class="kw">identical</span>(<span class="kw">rownames</span>(segerstolpe), <span class="kw">rownames</span>(muraro))</a></code></pre></div> +<div class="sourceCode" id="cb845"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb845-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/muraro.rds"</span>)</a> +<a class="sourceLine" id="cb845-2" data-line-number="2">segerstolpe <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/segerstolpe.rds"</span>)</a> +<a class="sourceLine" id="cb845-3" data-line-number="3">segerstolpe <-<span class="st"> </span>segerstolpe[,<span class="kw">colData</span>(segerstolpe)<span 
class="op">$</span>cell_type1 <span class="op">!=</span><span class="st"> "unclassified"</span>]</a> +<a class="sourceLine" id="cb845-4" data-line-number="4">segerstolpe <-<span class="st"> </span>segerstolpe[,<span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1 <span class="op">!=</span><span class="st"> "not applicable"</span>,]</a> +<a class="sourceLine" id="cb845-5" data-line-number="5">muraro <-<span class="st"> </span>muraro[,<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1 <span class="op">!=</span><span class="st"> "unclear"</span>]</a> +<a class="sourceLine" id="cb845-6" data-line-number="6">is.common <-<span class="st"> </span><span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol <span class="op">%in%</span><span class="st"> </span><span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol</a> +<a class="sourceLine" id="cb845-7" data-line-number="7">muraro <-<span class="st"> </span>muraro[is.common,]</a> +<a class="sourceLine" id="cb845-8" data-line-number="8">segerstolpe <-<span class="st"> </span>segerstolpe[<span class="kw">match</span>(<span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol, <span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol),]</a> +<a class="sourceLine" id="cb845-9" data-line-number="9"><span class="kw">rownames</span>(segerstolpe) <-<span class="st"> </span><span class="kw">rowData</span>(segerstolpe)<span class="op">$</span>feature_symbol</a> +<a class="sourceLine" id="cb845-10" data-line-number="10"><span class="kw">rownames</span>(muraro) <-<span class="st"> </span><span class="kw">rowData</span>(muraro)<span class="op">$</span>feature_symbol</a> +<a class="sourceLine" id="cb845-11" data-line-number="11"><span class="kw">identical</span>(<span class="kw">rownames</span>(segerstolpe), <span class="kw">rownames</span>(muraro))</a></code></pre></div> <p>First we will reformat 
our data into Seurat objects:</p> -<div class="sourceCode" id="cb763"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb763-1" data-line-number="1"><span class="kw">require</span>(<span class="st">"Seurat"</span>)</a> -<a class="sourceLine" id="cb763-2" data-line-number="2"><span class="kw">set.seed</span>(<span class="dv">4719364</span>)</a> -<a class="sourceLine" id="cb763-3" data-line-number="3">muraro_seurat <-<span class="st"> </span><span class="kw">CreateSeuratObject</span>(<span class="dt">raw.data=</span><span class="kw">assays</span>(muraro)[[<span class="st">"normcounts"</span>]]) <span class="co"># raw counts aren't available for muraro</span></a> -<a class="sourceLine" id="cb763-4" data-line-number="4">muraro_seurat<span class="op">@</span>meta.data[, <span class="st">"dataset"</span>] <-<span class="st"> </span><span class="dv">1</span></a> -<a class="sourceLine" id="cb763-5" data-line-number="5">muraro_seurat<span class="op">@</span>meta.data[, <span class="st">"celltype"</span>] <-<span class="st"> </span><span class="kw">paste</span>(<span class="st">"m"</span>,<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1, <span class="dt">sep=</span><span class="st">"-"</span>)</a> -<a class="sourceLine" id="cb763-6" data-line-number="6"></a> -<a class="sourceLine" id="cb763-7" data-line-number="7">seger_seurat <-<span class="st"> </span><span class="kw">CreateSeuratObject</span>(<span class="dt">raw.data=</span><span class="kw">assays</span>(segerstolpe)[[<span class="st">"counts"</span>]])</a> -<a class="sourceLine" id="cb763-8" data-line-number="8">seger_seurat<span class="op">@</span>meta.data[, <span class="st">"dataset"</span>] <-<span class="st"> </span><span class="dv">2</span></a> -<a class="sourceLine" id="cb763-9" data-line-number="9">seger_seurat<span class="op">@</span>meta.data[, <span class="st">"celltype"</span>] <-<span class="st"> </span><span class="kw">paste</span>(<span 
class="st">"s"</span>,<span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1, <span class="dt">sep=</span><span class="st">"-"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb846"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb846-1" data-line-number="1"><span class="kw">require</span>(<span class="st">"Seurat"</span>)</a> +<a class="sourceLine" id="cb846-2" data-line-number="2"><span class="kw">set.seed</span>(<span class="dv">4719364</span>)</a> +<a class="sourceLine" id="cb846-3" data-line-number="3">muraro_seurat <-<span class="st"> </span><span class="kw">CreateSeuratObject</span>(<span class="dt">raw.data=</span><span class="kw">assays</span>(muraro)[[<span class="st">"normcounts"</span>]]) <span class="co"># raw counts aren't available for muraro</span></a> +<a class="sourceLine" id="cb846-4" data-line-number="4">muraro_seurat<span class="op">@</span>meta.data[, <span class="st">"dataset"</span>] <-<span class="st"> </span><span class="dv">1</span></a> +<a class="sourceLine" id="cb846-5" data-line-number="5">muraro_seurat<span class="op">@</span>meta.data[, <span class="st">"celltype"</span>] <-<span class="st"> </span><span class="kw">paste</span>(<span class="st">"m"</span>,<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1, <span class="dt">sep=</span><span class="st">"-"</span>)</a> +<a class="sourceLine" id="cb846-6" data-line-number="6"></a> +<a class="sourceLine" id="cb846-7" data-line-number="7">seger_seurat <-<span class="st"> </span><span class="kw">CreateSeuratObject</span>(<span class="dt">raw.data=</span><span class="kw">assays</span>(segerstolpe)[[<span class="st">"counts"</span>]])</a> +<a class="sourceLine" id="cb846-8" data-line-number="8">seger_seurat<span class="op">@</span>meta.data[, <span class="st">"dataset"</span>] <-<span class="st"> </span><span class="dv">2</span></a> +<a class="sourceLine" id="cb846-9" 
data-line-number="9">seger_seurat<span class="op">@</span>meta.data[, <span class="st">"celltype"</span>] <-<span class="st"> </span><span class="kw">paste</span>(<span class="st">"s"</span>,<span class="kw">colData</span>(segerstolpe)<span class="op">$</span>cell_type1, <span class="dt">sep=</span><span class="st">"-"</span>)</a></code></pre></div> <p>Next we must normalize, scale and identify highly variable genes for each dataset:</p> -<div class="sourceCode" id="cb764"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb764-1" data-line-number="1">muraro_seurat <-<span class="st"> </span><span class="kw">NormalizeData</span>(<span class="dt">object=</span>muraro_seurat)</a> -<a class="sourceLine" id="cb764-2" data-line-number="2">muraro_seurat <-<span class="st"> </span><span class="kw">ScaleData</span>(<span class="dt">object=</span>muraro_seurat)</a> -<a class="sourceLine" id="cb764-3" data-line-number="3">muraro_seurat <-<span class="st"> </span><span class="kw">FindVariableGenes</span>(<span class="dt">object=</span>muraro_seurat, <span class="dt">do.plot=</span><span class="ot">TRUE</span>)</a> -<a class="sourceLine" id="cb764-4" data-line-number="4"></a> -<a class="sourceLine" id="cb764-5" data-line-number="5">seger_seurat <-<span class="st"> </span><span class="kw">NormalizeData</span>(<span class="dt">object=</span>seger_seurat)</a> -<a class="sourceLine" id="cb764-6" data-line-number="6">seger_seurat <-<span class="st"> </span><span class="kw">ScaleData</span>(<span class="dt">object=</span>seger_seurat)</a> -<a class="sourceLine" id="cb764-7" data-line-number="7">seger_seurat <-<span class="st"> </span><span class="kw">FindVariableGenes</span>(<span class="dt">object=</span>seger_seurat, <span class="dt">do.plot=</span><span class="ot">TRUE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb847"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb847-1" 
data-line-number="1">muraro_seurat <-<span class="st"> </span><span class="kw">NormalizeData</span>(<span class="dt">object=</span>muraro_seurat)</a> +<a class="sourceLine" id="cb847-2" data-line-number="2">muraro_seurat <-<span class="st"> </span><span class="kw">ScaleData</span>(<span class="dt">object=</span>muraro_seurat)</a> +<a class="sourceLine" id="cb847-3" data-line-number="3">muraro_seurat <-<span class="st"> </span><span class="kw">FindVariableGenes</span>(<span class="dt">object=</span>muraro_seurat, <span class="dt">do.plot=</span><span class="ot">TRUE</span>)</a> +<a class="sourceLine" id="cb847-4" data-line-number="4"></a> +<a class="sourceLine" id="cb847-5" data-line-number="5">seger_seurat <-<span class="st"> </span><span class="kw">NormalizeData</span>(<span class="dt">object=</span>seger_seurat)</a> +<a class="sourceLine" id="cb847-6" data-line-number="6">seger_seurat <-<span class="st"> </span><span class="kw">ScaleData</span>(<span class="dt">object=</span>seger_seurat)</a> +<a class="sourceLine" id="cb847-7" data-line-number="7">seger_seurat <-<span class="st"> </span><span class="kw">FindVariableGenes</span>(<span class="dt">object=</span>seger_seurat, <span class="dt">do.plot=</span><span class="ot">TRUE</span>)</a></code></pre></div> <p>Even though Seurat corrects for the relationship between dispersion and mean expression, it doesn’t use the corrected value when ranking features. 
Compare the results of the command below with the results in the plots above:</p> -<div class="sourceCode" id="cb765"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb765-1" data-line-number="1"><span class="kw">head</span>(muraro_seurat<span class="op">@</span>hvg.info, <span class="dv">50</span>)</a> -<a class="sourceLine" id="cb765-2" data-line-number="2"><span class="kw">head</span>(seger_seurat<span class="op">@</span>hvg.info, <span class="dv">50</span>)</a></code></pre></div> +<div class="sourceCode" id="cb848"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb848-1" data-line-number="1"><span class="kw">head</span>(muraro_seurat<span class="op">@</span>hvg.info, <span class="dv">50</span>)</a> +<a class="sourceLine" id="cb848-2" data-line-number="2"><span class="kw">head</span>(seger_seurat<span class="op">@</span>hvg.info, <span class="dv">50</span>)</a></code></pre></div> <p>But we will follow their example and use the top 2000 most dispersed genes without correcting for mean expression from each dataset anyway.</p> -<div class="sourceCode" id="cb766"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb766-1" data-line-number="1">gene.use <-<span class="st"> </span><span class="kw">union</span>(<span class="kw">rownames</span>(<span class="dt">x =</span> <span class="kw">head</span>(<span class="dt">x =</span> muraro_seurat<span class="op">@</span>hvg.info, <span class="dt">n =</span> <span class="dv">2000</span>)),</a> -<a class="sourceLine" id="cb766-2" data-line-number="2">                <span class="kw">rownames</span>(<span class="dt">x =</span> <span class="kw">head</span>(<span class="dt">x =</span> seger_seurat<span class="op">@</span>hvg.info, <span class="dt">n =</span> <span class="dv">2000</span>)))</a></code></pre></div> +<div class="sourceCode" id="cb849"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb849-1" 
data-line-number="1">gene.use <-<span class="st"> </span><span class="kw">union</span>(<span class="kw">rownames</span>(<span class="dt">x =</span> <span class="kw">head</span>(<span class="dt">x =</span> muraro_seurat<span class="op">@</span>hvg.info, <span class="dt">n =</span> <span class="dv">2000</span>)),</a> +<a class="sourceLine" id="cb849-2" data-line-number="2"> <span class="kw">rownames</span>(<span class="dt">x =</span> <span class="kw">head</span>(<span class="dt">x =</span> seger_seurat<span class="op">@</span>hvg.info, <span class="dt">n =</span> <span class="dv">2000</span>)))</a></code></pre></div> <p><strong>Exercise</strong> Find the features we would use if we selected the top 2000 most dispersed after scaling by mean. (Hint: consider the <code>order</code> function)</p> <p><strong>Answer</strong></p> <p>Now we will run CCA to find the shared correlation structure for these two datasets:</p> <p>Note to speed up the calculations we will be using only the top 5 dimensions but ideally you would consider many more and then select the top most informative ones using <code>DimHeatmap</code>.</p> -<div class="sourceCode" id="cb767"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb767-1" data-line-number="1">merged_seurat <-<span class="st"> </span><span class="kw">RunCCA</span>(<span class="dt">object=</span>muraro_seurat, <span class="dt">object2=</span>seger_seurat, <span class="dt">genes.use=</span>gene.use, <span class="dt">add.cell.id1=</span><span class="st">"m"</span>, <span class="dt">add.cell.id2=</span><span class="st">"s"</span>, <span class="dt">num.cc =</span> <span class="dv">5</span>)</a> -<a class="sourceLine" id="cb767-2" data-line-number="2"><span class="kw">DimPlot</span>(<span class="dt">object =</span> merged_seurat, <span class="dt">reduction.use =</span> <span class="st">"cca"</span>, <span class="dt">group.by =</span> <span class="st">"dataset"</span>, <span class="dt">pt.size =</span> <span 
class="fl">0.5</span>) <span class="co"># Before correcting</span></a></code></pre></div> +<div class="sourceCode" id="cb850"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb850-1" data-line-number="1">merged_seurat <-<span class="st"> </span><span class="kw">RunCCA</span>(<span class="dt">object=</span>muraro_seurat, <span class="dt">object2=</span>seger_seurat, <span class="dt">genes.use=</span>gene.use, <span class="dt">add.cell.id1=</span><span class="st">"m"</span>, <span class="dt">add.cell.id2=</span><span class="st">"s"</span>, <span class="dt">num.cc =</span> <span class="dv">5</span>)</a> +<a class="sourceLine" id="cb850-2" data-line-number="2"><span class="kw">DimPlot</span>(<span class="dt">object =</span> merged_seurat, <span class="dt">reduction.use =</span> <span class="st">"cca"</span>, <span class="dt">group.by =</span> <span class="st">"dataset"</span>, <span class="dt">pt.size =</span> <span class="fl">0.5</span>) <span class="co"># Before correcting</span></a></code></pre></div> <p>To identify dataset specific cell-types we compare how well cells are ‘explained’ by CCA vs dataset-specific principal component analysis.</p> -<div class="sourceCode" id="cb768"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb768-1" data-line-number="1">merged_seurat <-<span class="st"> </span><span class="kw">CalcVarExpRatio</span>(<span class="dt">object =</span> merged_seurat, <span class="dt">reduction.type =</span> <span class="st">"pca"</span>, <span class="dt">grouping.var =</span> <span class="st">"dataset"</span>, <span class="dt">dims.use =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">5</span>)</a> -<a class="sourceLine" id="cb768-2" data-line-number="2">merged.all <-<span class="st"> </span>merged_seurat</a> -<a class="sourceLine" id="cb768-3" data-line-number="3">merged_seurat <-<span class="st"> </span><span class="kw">SubsetData</span>(<span 
class="dt">object=</span>merged_seurat, <span class="dt">subset.name=</span><span class="st">"var.ratio.pca"</span>, <span class="dt">accept.low =</span> <span class="fl">0.5</span>) <span class="co"># CCA > 1/2 as good as PCA</span></a> -<a class="sourceLine" id="cb768-4" data-line-number="4">merged.discard <-<span class="st"> </span><span class="kw">SubsetData</span>(<span class="dt">object=</span>merged.all, <span class="dt">subset.name=</span><span class="st">"var.ratio.pca"</span>, <span class="dt">accept.high =</span> <span class="fl">0.5</span>)</a> -<a class="sourceLine" id="cb768-5" data-line-number="5"></a> -<a class="sourceLine" id="cb768-6" data-line-number="6"><span class="kw">summary</span>(<span class="kw">factor</span>(merged.discard<span class="op">@</span>meta.data<span class="op">$</span>celltype)) <span class="co"># check the cell-type of the discarded cells.</span></a></code></pre></div> +<div class="sourceCode" id="cb851"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb851-1" data-line-number="1">merged_seurat <-<span class="st"> </span><span class="kw">CalcVarExpRatio</span>(<span class="dt">object =</span> merged_seurat, <span class="dt">reduction.type =</span> <span class="st">"pca"</span>, <span class="dt">grouping.var =</span> <span class="st">"dataset"</span>, <span class="dt">dims.use =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">5</span>)</a> +<a class="sourceLine" id="cb851-2" data-line-number="2">merged.all <-<span class="st"> </span>merged_seurat</a> +<a class="sourceLine" id="cb851-3" data-line-number="3">merged_seurat <-<span class="st"> </span><span class="kw">SubsetData</span>(<span class="dt">object=</span>merged_seurat, <span class="dt">subset.name=</span><span class="st">"var.ratio.pca"</span>, <span class="dt">accept.low =</span> <span class="fl">0.5</span>) <span class="co"># CCA > 1/2 as good as PCA</span></a> +<a class="sourceLine" id="cb851-4" 
data-line-number="4">merged.discard <-<span class="st"> </span><span class="kw">SubsetData</span>(<span class="dt">object=</span>merged.all, <span class="dt">subset.name=</span><span class="st">"var.ratio.pca"</span>, <span class="dt">accept.high =</span> <span class="fl">0.5</span>)</a> +<a class="sourceLine" id="cb851-5" data-line-number="5"></a> +<a class="sourceLine" id="cb851-6" data-line-number="6"><span class="kw">summary</span>(<span class="kw">factor</span>(merged.discard<span class="op">@</span>meta.data<span class="op">$</span>celltype)) <span class="co"># check the cell-type of the discarded cells.</span></a></code></pre></div> <p>Here we can see that despite both datasets containing endothelial cells, almost all of them have been discarded as “dataset-specific”. Now we can align the datasets:</p> -<div class="sourceCode" id="cb769"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb769-1" data-line-number="1">merged_seurat <-<span class="st"> </span><span class="kw">AlignSubspace</span>(<span class="dt">object =</span> merged_seurat, <span class="dt">reduction.type =</span> <span class="st">"cca"</span>, <span class="dt">grouping.var =</span> <span class="st">"dataset"</span>, <span class="dt">dims.align =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">5</span>)</a> -<a class="sourceLine" id="cb769-2" data-line-number="2"><span class="kw">DimPlot</span>(<span class="dt">object =</span> merged_seurat, <span class="dt">reduction.use =</span> <span class="st">"cca.aligned"</span>, <span class="dt">group.by =</span> <span class="st">"dataset"</span>, <span class="dt">pt.size =</span> <span class="fl">0.5</span>) <span class="co"># After aligning subspaces</span></a></code></pre></div> +<div class="sourceCode" id="cb852"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb852-1" data-line-number="1">merged_seurat <-<span class="st"> </span><span 
class="kw">AlignSubspace</span>(<span class="dt">object =</span> merged_seurat, <span class="dt">reduction.type =</span> <span class="st">"cca"</span>, <span class="dt">grouping.var =</span> <span class="st">"dataset"</span>, <span class="dt">dims.align =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">5</span>)</a> +<a class="sourceLine" id="cb852-2" data-line-number="2"><span class="kw">DimPlot</span>(<span class="dt">object =</span> merged_seurat, <span class="dt">reduction.use =</span> <span class="st">"cca.aligned"</span>, <span class="dt">group.by =</span> <span class="st">"dataset"</span>, <span class="dt">pt.size =</span> <span class="fl">0.5</span>) <span class="co"># After aligning subspaces</span></a></code></pre></div> <p><strong>Exercise</strong> Compare the results for if you use the features after scaling dispersions.</p> <p><strong>Answer</strong></p> @@ -789,10 +789,10 @@ Use the clustering methods we previously covered on the combined datasets. 
Do yo </div> <div id="search-scrna-seq-data" class="section level2"> <h2><span class="header-section-number">14.1</span> Search scRNA-Seq data</h2> -<div class="sourceCode" id="cb770"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb770-1" data-line-number="1"><span class="kw">library</span>(scfind)</a> -<a class="sourceLine" id="cb770-2" data-line-number="2"><span class="kw">library</span>(SingleCellExperiment)</a> -<a class="sourceLine" id="cb770-3" data-line-number="3"><span class="kw">library</span>(plotly)</a> -<a class="sourceLine" id="cb770-4" data-line-number="4"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a></code></pre></div> +<div class="sourceCode" id="cb853"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb853-1" data-line-number="1"><span class="kw">library</span>(scfind)</a> +<a class="sourceLine" id="cb853-2" data-line-number="2"><span class="kw">library</span>(SingleCellExperiment)</a> +<a class="sourceLine" id="cb853-3" data-line-number="3"><span class="kw">library</span>(plotly)</a> +<a class="sourceLine" id="cb853-4" data-line-number="4"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a></code></pre></div> <div id="about" class="section level3"> <h3><span class="header-section-number">14.1.1</span> About</h3> <p><code>scfind</code> is a tool that allows one to search single cell RNA-Seq collections @@ -806,17 +806,17 @@ package</a>. Cloud implementation of <h3><span class="header-section-number">14.1.2</span> Dataset</h3> <p>We will run <code>scfind</code> on the Tabula Muris 10X dataset. 
<code>scfind</code> also operates on <code>SingleCellExperiment</code> class:</p> -<div class="sourceCode" id="cb771"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb771-1" data-line-number="1">tm10x_heart <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/sce/Heart_10X.rds"</span>)</a> -<a class="sourceLine" id="cb771-2" data-line-number="2">tm10x_heart</a> -<a class="sourceLine" id="cb771-3" data-line-number="3"><span class="kw">colData</span>(tm10x_heart)</a></code></pre></div> +<div class="sourceCode" id="cb854"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb854-1" data-line-number="1">tm10x_heart <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/sce/Heart_10X.rds"</span>)</a> +<a class="sourceLine" id="cb854-2" data-line-number="2">tm10x_heart</a> +<a class="sourceLine" id="cb854-3" data-line-number="3"><span class="kw">colData</span>(tm10x_heart)</a></code></pre></div> </div> <div id="gene-index" class="section level3"> <h3><span class="header-section-number">14.1.3</span> Gene Index</h3> <p>Now we need to create a gene index using our dataset:</p> -<div class="sourceCode" id="cb772"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb772-1" data-line-number="1">heart_index <-<span class="st"> </span><span class="kw">buildCellTypeIndex</span>(</a> -<a class="sourceLine" id="cb772-2" data-line-number="2"> tm10x_heart,</a> -<a class="sourceLine" id="cb772-3" data-line-number="3"> <span class="dt">cell_type_column =</span> <span class="st">"cell_type1"</span></a> -<a class="sourceLine" id="cb772-4" data-line-number="4">)</a></code></pre></div> +<div class="sourceCode" id="cb855"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb855-1" data-line-number="1">heart_index <-<span class="st"> </span><span class="kw">buildCellTypeIndex</span>(</a> +<a class="sourceLine" id="cb855-2" 
data-line-number="2"> tm10x_heart,</a> +<a class="sourceLine" id="cb855-3" data-line-number="3"> <span class="dt">cell_type_column =</span> <span class="st">"cell_type1"</span></a> +<a class="sourceLine" id="cb855-4" data-line-number="4">)</a></code></pre></div> <p><code>scfind</code> adopts a two-step compression strategy which allows efficient compression of large cell-by-gene matrix and allows fast retrieval of data by gene query. We estimated that one can achieve 2 orders of magnitude compression @@ -830,44 +830,44 @@ can also merge all tissues together to create a super index using the function <code>mergeDataset</code>.</p> <p>The index can be saved in .rds format using <code>saveObject</code> function and loaded using <code>loadObject</code> function for future use.</p> -<div class="sourceCode" id="cb773"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb773-1" data-line-number="1">tm10x_thymus <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/sce/Thymus_10X.rds"</span>)</a> -<a class="sourceLine" id="cb773-2" data-line-number="2">thymus_index <-<span class="st"> </span><span class="kw">buildCellTypeIndex</span>(</a> -<a class="sourceLine" id="cb773-3" data-line-number="3"> tm10x_thymus, </a> -<a class="sourceLine" id="cb773-4" data-line-number="4"> <span class="dt">cell_type_column =</span> <span class="st">"cell_type1"</span></a> -<a class="sourceLine" id="cb773-5" data-line-number="5">)</a> -<a class="sourceLine" id="cb773-6" data-line-number="6"><span class="co">## scfind_index <- mergeDataset(heart_index, thymus_index)</span></a> -<a class="sourceLine" id="cb773-7" data-line-number="7"><span class="co">## scfind_index@datasets</span></a> -<a class="sourceLine" id="cb773-8" data-line-number="8"><span class="co">## cellTypeNames(scfind_index)</span></a> -<a class="sourceLine" id="cb773-9" data-line-number="9"><span class="co">## 
sample(scfindGenes(scfind_index),20)</span></a></code></pre></div> +<div class="sourceCode" id="cb856"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb856-1" data-line-number="1">tm10x_thymus <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/sce/Thymus_10X.rds"</span>)</a> +<a class="sourceLine" id="cb856-2" data-line-number="2">thymus_index <-<span class="st"> </span><span class="kw">buildCellTypeIndex</span>(</a> +<a class="sourceLine" id="cb856-3" data-line-number="3"> tm10x_thymus, </a> +<a class="sourceLine" id="cb856-4" data-line-number="4"> <span class="dt">cell_type_column =</span> <span class="st">"cell_type1"</span></a> +<a class="sourceLine" id="cb856-5" data-line-number="5">)</a> +<a class="sourceLine" id="cb856-6" data-line-number="6"><span class="co">## scfind_index <- mergeDataset(heart_index, thymus_index)</span></a> +<a class="sourceLine" id="cb856-7" data-line-number="7"><span class="co">## scfind_index@datasets</span></a> +<a class="sourceLine" id="cb856-8" data-line-number="8"><span class="co">## cellTypeNames(scfind_index)</span></a> +<a class="sourceLine" id="cb856-9" data-line-number="9"><span class="co">## sample(scfindGenes(scfind_index),20)</span></a></code></pre></div> <p>To quickly and easily find the enriched cell type using an interactive Shiny application use the following method:</p> </div> <div id="marker-genes" class="section level3"> <h3><span class="header-section-number">14.1.4</span> Marker genes</h3> <p>Now let’s find the marker genes for Thymus T cell in the datasets</p> -<div class="sourceCode" id="cb774"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb774-1" data-line-number="1"><span class="co"># Showing the top 5 marker genes for each cell type and sort by F1 score.</span></a> -<a class="sourceLine" id="cb774-2" data-line-number="2">t_cell_markers <-<span class="st"> </span><span 
class="kw">cellTypeMarkers</span>(scfind_index, <span class="dt">cell.types =</span> <span class="st">"Thymus.T cell"</span>, <span class="dt">top.k =</span> <span class="dv">5</span>, <span class="dt">sort.field =</span> <span class="st">"f1"</span>) </a> -<a class="sourceLine" id="cb774-3" data-line-number="3"></a> -<a class="sourceLine" id="cb774-4" data-line-number="4">t_cell_markers</a></code></pre></div> +<div class="sourceCode" id="cb857"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb857-1" data-line-number="1"><span class="co"># Showing the top 5 marker genes for each cell type and sort by F1 score.</span></a> +<a class="sourceLine" id="cb857-2" data-line-number="2">t_cell_markers <-<span class="st"> </span><span class="kw">cellTypeMarkers</span>(scfind_index, <span class="dt">cell.types =</span> <span class="st">"Thymus.T cell"</span>, <span class="dt">top.k =</span> <span class="dv">5</span>, <span class="dt">sort.field =</span> <span class="st">"f1"</span>) </a> +<a class="sourceLine" id="cb857-3" data-line-number="3"></a> +<a class="sourceLine" id="cb857-4" data-line-number="4">t_cell_markers</a></code></pre></div> <p>Next, you can evaluate the markers of Thymus T cell in Thymus stromal cell</p> -<div class="sourceCode" id="cb775"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb775-1" data-line-number="1"><span class="kw">evaluateMarkers</span>(</a> -<a class="sourceLine" id="cb775-2" data-line-number="2"> scfind_index, </a> -<a class="sourceLine" id="cb775-3" data-line-number="3"> <span class="dt">gene.list =</span> <span class="kw">as.character</span>(t_cell_markers<span class="op">$</span>genes), </a> -<a class="sourceLine" id="cb775-4" data-line-number="4"> <span class="dt">cell.types =</span> <span class="st">"Thymus.stromal cell"</span>, </a> -<a class="sourceLine" id="cb775-5" data-line-number="5"> <span class="dt">sort.field =</span> <span class="st">"f1"</span></a> -<a 
class="sourceLine" id="cb775-6" data-line-number="6">)</a></code></pre></div> -<div class="sourceCode" id="cb776"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb776-1" data-line-number="1"><span class="co"># By default, the marker evaluation takes all cell types in the dataset as background cell type, but you can use the argument `background.cell.types` to fine tune the evaluation</span></a> -<a class="sourceLine" id="cb776-2" data-line-number="2"></a> -<a class="sourceLine" id="cb776-3" data-line-number="3">background <-<span class="st"> </span><span class="kw">cellTypeNames</span>(scfind_index, <span class="dt">datasets =</span> <span class="st">"Thymus"</span>)</a> -<a class="sourceLine" id="cb776-4" data-line-number="4">background</a> -<a class="sourceLine" id="cb776-5" data-line-number="5"></a> -<a class="sourceLine" id="cb776-6" data-line-number="6"><span class="kw">evaluateMarkers</span>(</a> -<a class="sourceLine" id="cb776-7" data-line-number="7"> scfind_index, </a> -<a class="sourceLine" id="cb776-8" data-line-number="8"> <span class="dt">gene.list =</span> <span class="kw">as.character</span>(t_cell_markers<span class="op">$</span>genes), </a> -<a class="sourceLine" id="cb776-9" data-line-number="9"> <span class="dt">cell.types =</span> <span class="st">"Thymus.stromal cell"</span>, </a> -<a class="sourceLine" id="cb776-10" data-line-number="10"> <span class="dt">sort.field =</span> <span class="st">"f1"</span>, </a> -<a class="sourceLine" id="cb776-11" data-line-number="11"> <span class="dt">background.cell.types =</span> background</a> -<a class="sourceLine" id="cb776-12" data-line-number="12">)</a></code></pre></div> +<div class="sourceCode" id="cb858"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb858-1" data-line-number="1"><span class="kw">evaluateMarkers</span>(</a> +<a class="sourceLine" id="cb858-2" data-line-number="2"> scfind_index, </a> +<a class="sourceLine" id="cb858-3" 
data-line-number="3"> <span class="dt">gene.list =</span> <span class="kw">as.character</span>(t_cell_markers<span class="op">$</span>genes), </a> +<a class="sourceLine" id="cb858-4" data-line-number="4"> <span class="dt">cell.types =</span> <span class="st">"Thymus.stromal cell"</span>, </a> +<a class="sourceLine" id="cb858-5" data-line-number="5"> <span class="dt">sort.field =</span> <span class="st">"f1"</span></a> +<a class="sourceLine" id="cb858-6" data-line-number="6">)</a></code></pre></div> +<div class="sourceCode" id="cb859"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb859-1" data-line-number="1"><span class="co"># By default, the marker evaluation takes all cell types in the dataset as background cell type, but you can use the argument `background.cell.types` to fine tune the evaluation</span></a> +<a class="sourceLine" id="cb859-2" data-line-number="2"></a> +<a class="sourceLine" id="cb859-3" data-line-number="3">background <-<span class="st"> </span><span class="kw">cellTypeNames</span>(scfind_index, <span class="dt">datasets =</span> <span class="st">"Thymus"</span>)</a> +<a class="sourceLine" id="cb859-4" data-line-number="4">background</a> +<a class="sourceLine" id="cb859-5" data-line-number="5"></a> +<a class="sourceLine" id="cb859-6" data-line-number="6"><span class="kw">evaluateMarkers</span>(</a> +<a class="sourceLine" id="cb859-7" data-line-number="7"> scfind_index, </a> +<a class="sourceLine" id="cb859-8" data-line-number="8"> <span class="dt">gene.list =</span> <span class="kw">as.character</span>(t_cell_markers<span class="op">$</span>genes), </a> +<a class="sourceLine" id="cb859-9" data-line-number="9"> <span class="dt">cell.types =</span> <span class="st">"Thymus.stromal cell"</span>, </a> +<a class="sourceLine" id="cb859-10" data-line-number="10"> <span class="dt">sort.field =</span> <span class="st">"f1"</span>, </a> +<a class="sourceLine" id="cb859-11" data-line-number="11"> <span 
class="dt">background.cell.types =</span> background</a> +<a class="sourceLine" id="cb859-12" data-line-number="12">)</a></code></pre></div> </div> <div id="search-cells-by-a-gene-list" class="section level3"> <h3><span class="header-section-number">14.1.5</span> Search cells by a gene list</h3> @@ -876,29 +876,29 @@ interest from large single cell dataset. We will use the marker genes identified in an original publication <a href="https://www.nature.com/articles/cr201599">Yanbin et al. 2015</a>. Cardiomyocyte-specific markers used in immunostaining as shown in Figure 1.</p> -<div class="sourceCode" id="cb777"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb777-1" data-line-number="1">cardiomyocytes <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Mef2c"</span>, <span class="st">"Gata4"</span>, <span class="st">"Nkx2.5"</span>, <span class="st">"Myh6"</span>, <span class="st">"tnnt2"</span>, <span class="st">"tnni3"</span>, <span class="st">"CDH2"</span>, <span class="st">"Cx43"</span>, <span class="st">"GJA1"</span>) </a> -<a class="sourceLine" id="cb777-2" data-line-number="2">result <-<span class="st"> </span><span class="kw">markerGenes</span>(</a> -<a class="sourceLine" id="cb777-3" data-line-number="3"> scfind_index, </a> -<a class="sourceLine" id="cb777-4" data-line-number="4"> <span class="dt">gene.list =</span> cardiomyocytes</a> -<a class="sourceLine" id="cb777-5" data-line-number="5">)</a> -<a class="sourceLine" id="cb777-6" data-line-number="6">result</a></code></pre></div> +<div class="sourceCode" id="cb860"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb860-1" data-line-number="1">cardiomyocytes <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Mef2c"</span>, <span class="st">"Gata4"</span>, <span class="st">"Nkx2.5"</span>, <span class="st">"Myh6"</span>, <span class="st">"tnnt2"</span>, <span class="st">"tnni3"</span>, <span 
class="st">"CDH2"</span>, <span class="st">"Cx43"</span>, <span class="st">"GJA1"</span>) </a> +<a class="sourceLine" id="cb860-2" data-line-number="2">result <-<span class="st"> </span><span class="kw">markerGenes</span>(</a> +<a class="sourceLine" id="cb860-3" data-line-number="3"> scfind_index, </a> +<a class="sourceLine" id="cb860-4" data-line-number="4"> <span class="dt">gene.list =</span> cardiomyocytes</a> +<a class="sourceLine" id="cb860-5" data-line-number="5">)</a> +<a class="sourceLine" id="cb860-6" data-line-number="6">result</a></code></pre></div> <p>To allow search of enriched cell type from a long list of gene query, <code>scfind</code> features a query optimization routine. First, the function <code>markerGenes</code> will counter suggest subqueries that with the highest support in the dataset. The TF-IDF score for each gene set allows user to identify the best subquery for finding the most relevant cell type.</p> -<div class="sourceCode" id="cb778"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb778-1" data-line-number="1">best_subquery <-<span class="st"> </span>result[<span class="kw">which.max</span>(result<span class="op">$</span>tfidf),] <span class="co"># get the best subquery by ranking TF-IDF score</span></a> -<a class="sourceLine" id="cb778-2" data-line-number="2">best_subquery <-<span class="st"> </span><span class="kw">strsplit</span>(<span class="kw">as.character</span>(best_subquery<span class="op">$</span>Query), <span class="st">","</span>)[[<span class="dv">1</span>]] <span class="co"># obtain gene list</span></a> -<a class="sourceLine" id="cb778-3" data-line-number="3"><span class="kw">hyperQueryCellTypes</span>(</a> -<a class="sourceLine" id="cb778-4" data-line-number="4"> scfind_index,</a> -<a class="sourceLine" id="cb778-5" data-line-number="5"> <span class="dt">gene.list =</span> best_subquery</a> -<a class="sourceLine" id="cb778-6" data-line-number="6">)</a></code></pre></div> +<div 
class="sourceCode" id="cb861"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb861-1" data-line-number="1">best_subquery <-<span class="st"> </span>result[<span class="kw">which.max</span>(result<span class="op">$</span>tfidf),] <span class="co"># get the best subquery by ranking TF-IDF score</span></a> +<a class="sourceLine" id="cb861-2" data-line-number="2">best_subquery <-<span class="st"> </span><span class="kw">strsplit</span>(<span class="kw">as.character</span>(best_subquery<span class="op">$</span>Query), <span class="st">","</span>)[[<span class="dv">1</span>]] <span class="co"># obtain gene list</span></a> +<a class="sourceLine" id="cb861-3" data-line-number="3"><span class="kw">hyperQueryCellTypes</span>(</a> +<a class="sourceLine" id="cb861-4" data-line-number="4"> scfind_index,</a> +<a class="sourceLine" id="cb861-5" data-line-number="5"> <span class="dt">gene.list =</span> best_subquery</a> +<a class="sourceLine" id="cb861-6" data-line-number="6">)</a></code></pre></div> <p><code>hyperQueryCellTypes</code> function returns a list of p-values corresponding to all cell types in a given dataset. 
It also outputs a list of cells in which genes from the given gene list are co-expressed.</p> <p><strong>Exercise 1</strong></p> <p>Find the marker genes of all cell types in the Heart dataset</p> -<div class="sourceCode" id="cb779"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb779-1" data-line-number="1">cardiac_contractility <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Ace2"</span>,<span class="st">"Fkbp1b"</span>,<span class="st">"Gh"</span>,<span class="st">"Cacna1c"</span>,<span class="st">"Cd59b"</span>,<span class="st">"Ppp1r1a"</span>,<span class="st">"Tnnt2"</span>,<span class="st">"Nos1"</span>,<span class="st">"Agtr1a"</span>,<span class="st">"Camk2g"</span>,<span class="st">"Grk2"</span>,<span class="st">"Ins2"</span>,<span class="st">"Dnah8"</span>,<span class="st">"Igf1"</span>,<span class="st">"Nos3"</span>,<span class="st">"Nppa"</span>,<span class="st">"Nppb"</span>,<span class="st">"Il6"</span>,<span class="st">"Myh6"</span>,<span class="st">"Ren2"</span>,<span class="st">"Tnni3"</span>,<span class="st">"Apln"</span>,<span class="st">"Kcnmb1"</span>,<span class="st">"Pik3cg"</span>,<span class="st">"Prkca"</span>,<span class="st">"Aplnr"</span>,<span class="st">"Slc8a1"</span>,<span class="st">"Ace"</span>,<span class="st">"Akt1"</span>,<span class="st">"Edn1"</span>,<span class="st">"Kcnmb2"</span>,<span class="st">"Nos2"</span>,<span class="st">"Tnf"</span>,<span class="st">"Myh14"</span>,<span class="st">"Adrb2"</span>,<span class="st">"Agt"</span>,<span class="st">"Adrb1"</span>,<span class="st">"Atp2a2"</span>,<span class="st">"Ryr2"</span>,<span class="st">"Pln"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb862"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb862-1" data-line-number="1">cardiac_contractility <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Ace2"</span>,<span class="st">"Fkbp1b"</span>,<span 
class="st">"Gh"</span>,<span class="st">"Cacna1c"</span>,<span class="st">"Cd59b"</span>,<span class="st">"Ppp1r1a"</span>,<span class="st">"Tnnt2"</span>,<span class="st">"Nos1"</span>,<span class="st">"Agtr1a"</span>,<span class="st">"Camk2g"</span>,<span class="st">"Grk2"</span>,<span class="st">"Ins2"</span>,<span class="st">"Dnah8"</span>,<span class="st">"Igf1"</span>,<span class="st">"Nos3"</span>,<span class="st">"Nppa"</span>,<span class="st">"Nppb"</span>,<span class="st">"Il6"</span>,<span class="st">"Myh6"</span>,<span class="st">"Ren2"</span>,<span class="st">"Tnni3"</span>,<span class="st">"Apln"</span>,<span class="st">"Kcnmb1"</span>,<span class="st">"Pik3cg"</span>,<span class="st">"Prkca"</span>,<span class="st">"Aplnr"</span>,<span class="st">"Slc8a1"</span>,<span class="st">"Ace"</span>,<span class="st">"Akt1"</span>,<span class="st">"Edn1"</span>,<span class="st">"Kcnmb2"</span>,<span class="st">"Nos2"</span>,<span class="st">"Tnf"</span>,<span class="st">"Myh14"</span>,<span class="st">"Adrb2"</span>,<span class="st">"Agt"</span>,<span class="st">"Adrb1"</span>,<span class="st">"Atp2a2"</span>,<span class="st">"Ryr2"</span>,<span class="st">"Pln"</span>)</a></code></pre></div> <p><strong>Exercise 2</strong></p> <p>Input the gene list relevant to “cardiac contractility†and find the best gene set with the highest support. Identify the enriched cell type for this query.</p> @@ -911,34 +911,34 @@ logical operators including “-†and "*" for “no†and “interm respectively in front of the gene name. 
Here, we use operators to subset T cell of the Thymus dataset into effector T regulatory cells and effector memory T cell.</p> -<div class="sourceCode" id="cb780"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb780-1" data-line-number="1">effector_t_reg_cells <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"*Ptprc"</span>, <span class="st">"-Il7r"</span>, <span class="st">"Ctla4"</span>, <span class="st">"-Il7r"</span>)</a> -<a class="sourceLine" id="cb780-2" data-line-number="2">effector_memory_t_cells <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"-Il2ra"</span>, <span class="st">"*Ptprc"</span>, <span class="st">"Il7r"</span>)</a> -<a class="sourceLine" id="cb780-3" data-line-number="3"></a> -<a class="sourceLine" id="cb780-4" data-line-number="4">subset_treg <-<span class="st"> </span><span class="kw">findCellTypes</span>(scfind_index, effector_t_reg_cells, <span class="st">"Thymus"</span>) </a> -<a class="sourceLine" id="cb780-5" data-line-number="5">subset_tmem <-<span class="st"> </span><span class="kw">findCellTypes</span>(scfind_index, effector_memory_t_cells, <span class="st">"Thymus"</span>)</a> -<a class="sourceLine" id="cb780-6" data-line-number="6"></a> -<a class="sourceLine" id="cb780-7" data-line-number="7">subset_treg</a> -<a class="sourceLine" id="cb780-8" data-line-number="8">subset_tmem</a></code></pre></div> +<div class="sourceCode" id="cb863"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb863-1" data-line-number="1">effector_t_reg_cells <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"*Ptprc"</span>, <span class="st">"-Il7r"</span>, <span class="st">"Ctla4"</span>, <span class="st">"-Il7r"</span>)</a> +<a class="sourceLine" id="cb863-2" data-line-number="2">effector_memory_t_cells <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"-Il2ra"</span>, <span class="st">"*Ptprc"</span>, <span 
class="st">"Il7r"</span>)</a> +<a class="sourceLine" id="cb863-3" data-line-number="3"></a> +<a class="sourceLine" id="cb863-4" data-line-number="4">subset_treg <-<span class="st"> </span><span class="kw">findCellTypes</span>(scfind_index, effector_t_reg_cells, <span class="st">"Thymus"</span>) </a> +<a class="sourceLine" id="cb863-5" data-line-number="5">subset_tmem <-<span class="st"> </span><span class="kw">findCellTypes</span>(scfind_index, effector_memory_t_cells, <span class="st">"Thymus"</span>)</a> +<a class="sourceLine" id="cb863-6" data-line-number="6"></a> +<a class="sourceLine" id="cb863-7" data-line-number="7">subset_treg</a> +<a class="sourceLine" id="cb863-8" data-line-number="8">subset_tmem</a></code></pre></div> <p>Let’s use the TSNE plot information from the <code>SingleCellExperiment</code> of Thymus to illustrate the gating result</p> -<div class="sourceCode" id="cb781"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb781-1" data-line-number="1">map <-<span class="st"> </span><span class="kw">data.frame</span>(</a> -<a class="sourceLine" id="cb781-2" data-line-number="2"> tm10x_thymus<span class="op">@</span>reducedDims[[<span class="st">'TSNE'</span>]], </a> -<a class="sourceLine" id="cb781-3" data-line-number="3"> <span class="dt">cell_type =</span> <span class="kw">as.character</span>(<span class="kw">colData</span>(tm10x_thymus)<span class="op">$</span>cell_type1), </a> -<a class="sourceLine" id="cb781-4" data-line-number="4"> <span class="dt">stringsAsFactors =</span> F</a> -<a class="sourceLine" id="cb781-5" data-line-number="5"> )</a> -<a class="sourceLine" id="cb781-6" data-line-number="6"></a> -<a class="sourceLine" id="cb781-7" data-line-number="7">map <-<span class="st"> </span><span class="kw">subset</span>(map, cell_type <span class="op">==</span><span class="st"> "T cell"</span>) </a> -<a class="sourceLine" id="cb781-8" data-line-number="8"></a> -<a class="sourceLine" id="cb781-9" 
data-line-number="9"><span class="kw">plot_ly</span>(map, <span class="dt">x =</span> <span class="op">~</span>X1 , <span class="dt">y =</span> <span class="op">~</span>X2, <span class="dt">type=</span><span class="st">"scatter"</span>)</a> -<a class="sourceLine" id="cb781-10" data-line-number="10"></a> -<a class="sourceLine" id="cb781-11" data-line-number="11">map<span class="op">$</span>cell_type[subset_treg<span class="op">$</span><span class="st">`</span><span class="dt">Thymus.T cell</span><span class="st">`</span>] <-<span class="st"> "Effector T Regulatory Cell"</span></a> -<a class="sourceLine" id="cb781-12" data-line-number="12">map<span class="op">$</span>cell_type[subset_tmem<span class="op">$</span><span class="st">`</span><span class="dt">Thymus.T cell</span><span class="st">`</span>] <-<span class="st"> "Effector Memory T Cell"</span></a> -<a class="sourceLine" id="cb781-13" data-line-number="13"></a> -<a class="sourceLine" id="cb781-14" data-line-number="14"><span class="kw">plot_ly</span>(map, <span class="dt">x =</span> <span class="op">~</span>X1 , <span class="dt">y =</span> <span class="op">~</span>X2, <span class="dt">type=</span><span class="st">"scatter"</span>, <span class="dt">color =</span> <span class="op">~</span>cell_type)</a></code></pre></div> +<div class="sourceCode" id="cb864"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb864-1" data-line-number="1">map <-<span class="st"> </span><span class="kw">data.frame</span>(</a> +<a class="sourceLine" id="cb864-2" data-line-number="2"> tm10x_thymus<span class="op">@</span>reducedDims[[<span class="st">'TSNE'</span>]], </a> +<a class="sourceLine" id="cb864-3" data-line-number="3"> <span class="dt">cell_type =</span> <span class="kw">as.character</span>(<span class="kw">colData</span>(tm10x_thymus)<span class="op">$</span>cell_type1), </a> +<a class="sourceLine" id="cb864-4" data-line-number="4"> <span class="dt">stringsAsFactors =</span> F</a> +<a 
class="sourceLine" id="cb864-5" data-line-number="5"> )</a> +<a class="sourceLine" id="cb864-6" data-line-number="6"></a> +<a class="sourceLine" id="cb864-7" data-line-number="7">map <-<span class="st"> </span><span class="kw">subset</span>(map, cell_type <span class="op">==</span><span class="st"> "T cell"</span>) </a> +<a class="sourceLine" id="cb864-8" data-line-number="8"></a> +<a class="sourceLine" id="cb864-9" data-line-number="9"><span class="kw">plot_ly</span>(map, <span class="dt">x =</span> <span class="op">~</span>X1 , <span class="dt">y =</span> <span class="op">~</span>X2, <span class="dt">type=</span><span class="st">"scatter"</span>)</a> +<a class="sourceLine" id="cb864-10" data-line-number="10"></a> +<a class="sourceLine" id="cb864-11" data-line-number="11">map<span class="op">$</span>cell_type[subset_treg<span class="op">$</span><span class="st">`</span><span class="dt">Thymus.T cell</span><span class="st">`</span>] <-<span class="st"> "Effector T Regulatory Cell"</span></a> +<a class="sourceLine" id="cb864-12" data-line-number="12">map<span class="op">$</span>cell_type[subset_tmem<span class="op">$</span><span class="st">`</span><span class="dt">Thymus.T cell</span><span class="st">`</span>] <-<span class="st"> "Effector Memory T Cell"</span></a> +<a class="sourceLine" id="cb864-13" data-line-number="13"></a> +<a class="sourceLine" id="cb864-14" data-line-number="14"><span class="kw">plot_ly</span>(map, <span class="dt">x =</span> <span class="op">~</span>X1 , <span class="dt">y =</span> <span class="op">~</span>X2, <span class="dt">type=</span><span class="st">"scatter"</span>, <span class="dt">color =</span> <span class="op">~</span>cell_type)</a></code></pre></div> </div> <div id="sessioninfo-11" class="section level3"> <h3><span class="header-section-number">14.1.7</span> sessionInfo()</h3> -<div class="sourceCode" id="cb782"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb782-1" data-line-number="1"><span 
class="kw">sessionInfo</span>()</a></code></pre></div> +<div class="sourceCode" id="cb865"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb865-1" data-line-number="1"><span class="kw">sessionInfo</span>()</a></code></pre></div> </div> </div> diff --git a/public/datasets.html b/public/datasets.html index f4295d2b264e2cc787bd54132749daba170c8577..74cab4e76d9fcbcc38c188294fa128196d9f00dd 100644 --- a/public/datasets.html +++ b/public/datasets.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa 
fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a 
href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" 
data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa 
fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -534,22 +534,22 @@ the 15/03/16. We will use these copies for reproducibility purposes.</p> </div> <div id="pancreas" class="section level2"> <h2><span class="header-section-number">4.3</span> Pancreas</h2> -<p>We have included two human pancreas datasets: from Muraro et al (2016) and -Segerstolpe et al. (2016). 
Since the pancreas has been widely studied, these +<p>We have included two human pancreas datasets: from Muraro et al (2016) <span class="citation">(Muraro et al. <a href="#ref-Muraro2016-yk">2016</a>)</span> and +Segerstolpe et al. (2016) <span class="citation">(Segerstolpe et al. <a href="#ref-Segerstolpe2016-wc">2016</a>)</span>. Since the pancreas has been widely studied, these datasets are well annotated.</p> <div id="muraro" class="section level3"> <h3><span class="header-section-number">4.3.1</span> Muraro</h3> -<p>Single-cell CEL-seq data were generated using a customised automated platform +<p>Single-cell CEL-seq2 data were generated using a customised automated platform that uses FACS, robotics, and the CEL-Seq2 protocol to obtain the transcriptomes of thousands of single pancreatic cells from four deceased organ donors. Cell -surface markers can be used for sorting and enriching certain cell types.</p> +surface markers can be used for sorting and enriching certain cell types.<span class="citation">(Muraro et al. <a href="#ref-Muraro2016-yk">2016</a>)</span></p> <p><a href="https://www.ncbi.nlm.nih.gov/pubmed/27693023">Muraro,M.J. et al. (2016) A Single-Cell Transcriptome Atlas of the Human Pancreas. Cell Syst, 3, 385–394.e3.</a></p> </div> <div id="segerstolpe" class="section level3"> <h3><span class="header-section-number">4.3.2</span> Segerstolpe</h3> <p>Single-cell RNA-seq dataset of human pancreatic cells from patients with type 2 diabetes and healthy controls. Single cells were prepared using Smart-seq2 -protocol and sequenced on an Illumina HiSeq 2000.</p> +protocol and sequenced on an Illumina HiSeq 2000.<span class="citation">(Segerstolpe et al. <a href="#ref-Segerstolpe2016-wc">2016</a>)</span></p> <p><a href="https://www.ncbi.nlm.nih.gov/pubmed/27667667">Segerstolpe,Ã…. et al. (2016) Single-Cell Transcriptome Profiling of Human Pancreatic Islets in Health and Type 2 Diabetes. 
Cell Metab., 24, 593–607.</a></p> </div> </div> @@ -759,6 +759,15 @@ rather than loading the whole thing into RAM.</p> <h2><span class="header-section-number">4.13</span> Advanced Exercise</h2> <p>Write an R function/script which will fully automate this procedure for each data-type for any tissue.</p> +</div> +</div> +<h3> References</h3> +<div id="refs" class="references"> +<div id="ref-Muraro2016-yk"> +<p>Muraro, Mauro J., Gitanjali Dharmadhikari, Dominic Grün, Nathalie Groen, Tim Dielen, Erik Jansen, Leon van Gurp, et al. 2016. “A Single-Cell Transcriptome Atlas of the Human Pancreas.†<em>Cell Systems</em> 3 (4). Elsevier BV: 385–394.e3. <a href="https://doi.org/10.1016/j.cels.2016.09.002">https://doi.org/10.1016/j.cels.2016.09.002</a>.</p> +</div> +<div id="ref-Segerstolpe2016-wc"> +<p>Segerstolpe, Ã…sa, Athanasia Palasantza, Pernilla Eliasson, Eva-Marie Andersson, Anne-Christine Andréasson, Xiaoyan Sun, Simone Picelli, et al. 2016. “Single-Cell Transcriptome Profiling of Human Pancreatic Islets in Health and Type 2 Diabetes.†<em>Cell Metabolism</em> 24 (4). Elsevier BV: 593–607. <a href="https://doi.org/10.1016/j.cmet.2016.08.020">https://doi.org/10.1016/j.cmet.2016.08.020</a>.</p> </div> </div> </section> diff --git a/public/datasets.md b/public/datasets.md index de1fa0d283773e4ef48e490e9d72309a4019cc84..33e40858d25ff80f92ef99ea33ab19554a806b24 100644 --- a/public/datasets.md +++ b/public/datasets.md @@ -37,16 +37,16 @@ the 15/03/16. We will use these copies for reproducibility purposes. ## Pancreas -We have included two human pancreas datasets: from Muraro et al (2016) and -Segerstolpe et al. (2016). Since the pancreas has been widely studied, these +We have included two human pancreas datasets: from Muraro et al (2016) [@Muraro2016-yk] and +Segerstolpe et al. (2016) [@Segerstolpe2016-wc]. Since the pancreas has been widely studied, these datasets are well annotated. 
### Muraro -Single-cell CEL-seq data were generated using a customised automated platform +Single-cell CEL-seq2 data were generated using a customised automated platform that uses FACS, robotics, and the CEL-Seq2 protocol to obtain the transcriptomes of thousands of single pancreatic cells from four deceased organ donors. Cell -surface markers can be used for sorting and enriching certain cell types. +surface markers can be used for sorting and enriching certain cell types.[@Muraro2016-yk] [Muraro,M.J. et al. (2016) A Single-Cell Transcriptome Atlas of the Human Pancreas. Cell Syst, 3, 385–394.e3.](https://www.ncbi.nlm.nih.gov/pubmed/27693023) @@ -54,7 +54,7 @@ surface markers can be used for sorting and enriching certain cell types. Single-cell RNA-seq dataset of human pancreatic cells from patients with type 2 diabetes and healthy controls. Single cells were prepared using Smart-seq2 -protocol and sequenced on an Illumina HiSeq 2000. +protocol and sequenced on an Illumina HiSeq 2000.[@Segerstolpe2016-wc] [Segerstolpe,Ã…. et al. (2016) Single-Cell Transcriptome Profiling of Human Pancreatic Islets in Health and Type 2 Diabetes. 
Cell Metab., 24, 593–607.](https://www.ncbi.nlm.nih.gov/pubmed/27667667) diff --git a/public/de-intro.md b/public/de-intro.md index 1c34dcfda3a34619bc797fdf87828a227945205e..2cdade6d4ca467f53d960cfe535bff1045e7070a 100644 --- a/public/de-intro.md +++ b/public/de-intro.md @@ -59,6 +59,11 @@ hist( main = "Negative Binomial" ) ``` + +<div class="figure" style="text-align: center"> +<img src="de-intro_files/figure-html/nb-plot-1.png" alt="Negative Binomial distribution of read counts for a single gene across 1000 cells" width="90%" /> +<p class="caption">(\#fig:nb-plot)Negative Binomial distribution of read counts for a single gene across 1000 cells</p> +</div> Mean: $\mu = mu$ @@ -85,6 +90,11 @@ hist( main = "Zero-inflated NB" ) ``` + +<div class="figure" style="text-align: center"> +<img src="de-intro_files/figure-html/zero-inflation-plot-1.png" alt="Zero-inflated Negative Binomial distribution" width="90%" /> +<p class="caption">(\#fig:zero-inflation-plot)Zero-inflated Negative Binomial distribution</p> +</div> Mean: $\mu = mu \cdot (1 - d)$ @@ -109,6 +119,8 @@ hist( main = "Poisson-Beta" ) ``` + +<img src="de-intro_files/figure-html/pois-beta-plot-1.png" width="90%" style="display: block; margin: auto;" /> Mean: $\mu = g \cdot a / (a + b)$ diff --git a/public/de-intro_files/figure-html/nb-plot-1.png b/public/de-intro_files/figure-html/nb-plot-1.png index 368672109652a58ebad29fd59b11887f55493e1c..7b1072a4ec25de00b2d9b24eba7c927659784646 100644 Binary files a/public/de-intro_files/figure-html/nb-plot-1.png and b/public/de-intro_files/figure-html/nb-plot-1.png differ diff --git a/public/de-intro_files/figure-html/pois-beta-plot-1.png b/public/de-intro_files/figure-html/pois-beta-plot-1.png index 513810eeddf3c7e536a7c06399307930965fd4cc..43400a2739622e152beef51b0489155f1b320f5a 100644 Binary files a/public/de-intro_files/figure-html/pois-beta-plot-1.png and b/public/de-intro_files/figure-html/pois-beta-plot-1.png differ diff --git 
a/public/de-intro_files/figure-html/zero-inflation-plot-1.png b/public/de-intro_files/figure-html/zero-inflation-plot-1.png index 3a23305e19966ceaa10cd8684bbc536c433ee734..08f4cfeb854d8eb3ca975268b935ed909a837b48 100644 Binary files a/public/de-intro_files/figure-html/zero-inflation-plot-1.png and b/public/de-intro_files/figure-html/zero-inflation-plot-1.png differ diff --git a/public/dechapter.html b/public/dechapter.html index e59a090cb0922714646ba2cbe33e0e69dbfae78f..fd3cec6e7e7d4e6f1c67ff7aa09f42d9bbc4387f 100644 --- a/public/dechapter.html +++ b/public/dechapter.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" 
data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some 
notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa 
fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" 
data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -535,59 +535,72 @@ expressed genes either by comparing the differences in variance between the grou <div id="differences-in-distribution" class="section level3"> <h3><span class="header-section-number">12.1.3</span> Differences in Distribution</h3> 
<p>Unlike bulk RNA-seq, we generally have a large number of samples (i.e. cells) for each group we are comparing in single-cell experiments. Thus we can take advantage of the whole distribution of expression values in each group to identify differences between groups rather than only comparing estimates of mean-expression as is standard for bulk RNASeq.</p> -<p>There are two main approaches to comparing distributions. Firstly, we can use existing statistical models/distributions and fit the same type of model to the expression in each group then test for differences in the parameters for each model, or test whether the model fits better if a particular paramter is allowed to be different according to group. For instance in Chapter <a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders">7.6</a> we used edgeR to test whether allowing mean expression to be different in different batches significantly improved the fit of a negative binomial model of the data.</p> +<p>There are two main approaches to comparing distributions. Firstly, we can use existing statistical models/distributions and fit the same type of model to the expression in each group then test for differences in the parameters for each model, or test whether the model fits better if a particular paramter is allowed to be different according to group. For instance in Chapter <a href="#dealing-with-confounders"><strong>??</strong></a> we used edgeR to test whether allowing mean expression to be different in different batches significantly improved the fit of a negative binomial model of the data.</p> <p>Alternatively, we can use a non-parametric test which does not assume that expression values follow any particular distribution, e.g. the <a href="https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test">Kolmogorov-Smirnov test (KS-test)</a>. 
Non-parametric tests generally convert observed expression values to ranks and test whether the distribution of ranks for one group are signficantly different from the distribution of ranks for the other group. However, some non-parametric methods fail in the presence of a large number of tied values, such as the case for dropouts (zeros) in single-cell RNA-seq expression data. Moreover, if the conditions for a parametric test hold, then it will typically be more powerful than a non-parametric test.</p> </div> <div id="models-of-single-cell-rnaseq-data" class="section level3"> <h3><span class="header-section-number">12.1.4</span> Models of single-cell RNASeq data</h3> <p>The most common model of RNASeq data is the negative binomial model:</p> -<div class="sourceCode" id="cb712"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb712-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">1</span>)</a> -<a class="sourceLine" id="cb712-2" data-line-number="2"><span class="kw">hist</span>(</a> -<a class="sourceLine" id="cb712-3" data-line-number="3"> <span class="kw">rnbinom</span>(</a> -<a class="sourceLine" id="cb712-4" data-line-number="4"> <span class="dv">1000</span>, </a> -<a class="sourceLine" id="cb712-5" data-line-number="5"> <span class="dt">mu =</span> <span class="dv">10</span>, </a> -<a class="sourceLine" id="cb712-6" data-line-number="6"> <span class="dt">size =</span> <span class="dv">100</span>), </a> -<a class="sourceLine" id="cb712-7" data-line-number="7"> <span class="dt">col =</span> <span class="st">"grey50"</span>, </a> -<a class="sourceLine" id="cb712-8" data-line-number="8"> <span class="dt">xlab =</span> <span class="st">"Read Counts"</span>, </a> -<a class="sourceLine" id="cb712-9" data-line-number="9"> <span class="dt">main =</span> <span class="st">"Negative Binomial"</span></a> -<a class="sourceLine" id="cb712-10" data-line-number="10">)</a></code></pre></div> +<div class="sourceCode" 
id="cb795"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb795-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">1</span>)</a> +<a class="sourceLine" id="cb795-2" data-line-number="2"><span class="kw">hist</span>(</a> +<a class="sourceLine" id="cb795-3" data-line-number="3"> <span class="kw">rnbinom</span>(</a> +<a class="sourceLine" id="cb795-4" data-line-number="4"> <span class="dv">1000</span>, </a> +<a class="sourceLine" id="cb795-5" data-line-number="5"> <span class="dt">mu =</span> <span class="dv">10</span>, </a> +<a class="sourceLine" id="cb795-6" data-line-number="6"> <span class="dt">size =</span> <span class="dv">100</span>), </a> +<a class="sourceLine" id="cb795-7" data-line-number="7"> <span class="dt">col =</span> <span class="st">"grey50"</span>, </a> +<a class="sourceLine" id="cb795-8" data-line-number="8"> <span class="dt">xlab =</span> <span class="st">"Read Counts"</span>, </a> +<a class="sourceLine" id="cb795-9" data-line-number="9"> <span class="dt">main =</span> <span class="st">"Negative Binomial"</span></a> +<a class="sourceLine" id="cb795-10" data-line-number="10">)</a></code></pre></div> +<div class="figure" style="text-align: center"><span id="fig:nb-plot"></span> +<img src="de-intro_files/figure-html/nb-plot-1.png" alt="Negative Binomial distribution of read counts for a single gene across 1000 cells" width="90%" /> +<p class="caption"> +Figure 12.1: Negative Binomial distribution of read counts for a single gene across 1000 cells +</p> +</div> <p>Mean: <span class="math inline">\(\mu = mu\)</span></p> <p>Variance: <span class="math inline">\(\sigma^2 = mu + mu^2/size\)</span></p> <p>It is parameterized by the mean expression (mu) and the dispersion (size), which is inversely related to the variance. The negative binomial model fits bulk RNA-seq data very well and it is used for most statistical methods designed for such data. 
In addition, it has been show to fit the distribution of molecule counts obtained from data tagged by unique molecular identifiers (UMIs) quite well (<a href="http://www.nature.com/nmeth/journal/v11/n6/full/nmeth.2930.html">Grun et al. 2014</a>, <a href="http://genome.cshlp.org/content/21/7/1160">Islam et al. 2011</a>).</p> <p>However, a raw negative binomial model does not fit full-length transcript data as well due to the high dropout rates relative to the non-zero read counts. For this type of data a variety of zero-inflated negative binomial models have been proposed (e.g. <a href="https://bioconductor.org/packages/release/bioc/html/MAST.html">MAST</a>, <a href="https://bioconductor.org/packages/release/bioc/html/scde.html">SCDE</a>).</p> -<div class="sourceCode" id="cb713"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb713-1" data-line-number="1">d <-<span class="st"> </span><span class="fl">0.5</span>;</a> -<a class="sourceLine" id="cb713-2" data-line-number="2">counts <-<span class="st"> </span><span class="kw">rnbinom</span>(</a> -<a class="sourceLine" id="cb713-3" data-line-number="3"> <span class="dv">1000</span>, </a> -<a class="sourceLine" id="cb713-4" data-line-number="4"> <span class="dt">mu =</span> <span class="dv">10</span>, </a> -<a class="sourceLine" id="cb713-5" data-line-number="5"> <span class="dt">size =</span> <span class="dv">100</span></a> -<a class="sourceLine" id="cb713-6" data-line-number="6">)</a> -<a class="sourceLine" id="cb713-7" data-line-number="7">counts[<span class="kw">runif</span>(<span class="dv">1000</span>) <span class="op"><</span><span class="st"> </span>d] <-<span class="st"> </span><span class="dv">0</span></a> -<a class="sourceLine" id="cb713-8" data-line-number="8"><span class="kw">hist</span>(</a> -<a class="sourceLine" id="cb713-9" data-line-number="9"> counts, </a> -<a class="sourceLine" id="cb713-10" data-line-number="10"> <span class="dt">col =</span> <span 
class="st">"grey50"</span>, </a> -<a class="sourceLine" id="cb713-11" data-line-number="11"> <span class="dt">xlab =</span> <span class="st">"Read Counts"</span>, </a> -<a class="sourceLine" id="cb713-12" data-line-number="12"> <span class="dt">main =</span> <span class="st">"Zero-inflated NB"</span></a> -<a class="sourceLine" id="cb713-13" data-line-number="13">)</a></code></pre></div> +<div class="sourceCode" id="cb796"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb796-1" data-line-number="1">d <-<span class="st"> </span><span class="fl">0.5</span>;</a> +<a class="sourceLine" id="cb796-2" data-line-number="2">counts <-<span class="st"> </span><span class="kw">rnbinom</span>(</a> +<a class="sourceLine" id="cb796-3" data-line-number="3"> <span class="dv">1000</span>, </a> +<a class="sourceLine" id="cb796-4" data-line-number="4"> <span class="dt">mu =</span> <span class="dv">10</span>, </a> +<a class="sourceLine" id="cb796-5" data-line-number="5"> <span class="dt">size =</span> <span class="dv">100</span></a> +<a class="sourceLine" id="cb796-6" data-line-number="6">)</a> +<a class="sourceLine" id="cb796-7" data-line-number="7">counts[<span class="kw">runif</span>(<span class="dv">1000</span>) <span class="op"><</span><span class="st"> </span>d] <-<span class="st"> </span><span class="dv">0</span></a> +<a class="sourceLine" id="cb796-8" data-line-number="8"><span class="kw">hist</span>(</a> +<a class="sourceLine" id="cb796-9" data-line-number="9"> counts, </a> +<a class="sourceLine" id="cb796-10" data-line-number="10"> <span class="dt">col =</span> <span class="st">"grey50"</span>, </a> +<a class="sourceLine" id="cb796-11" data-line-number="11"> <span class="dt">xlab =</span> <span class="st">"Read Counts"</span>, </a> +<a class="sourceLine" id="cb796-12" data-line-number="12"> <span class="dt">main =</span> <span class="st">"Zero-inflated NB"</span></a> +<a class="sourceLine" id="cb796-13" 
data-line-number="13">)</a></code></pre></div> +<div class="figure" style="text-align: center"><span id="fig:zero-inflation-plot"></span> +<img src="de-intro_files/figure-html/zero-inflation-plot-1.png" alt="Zero-inflated Negative Binomial distribution" width="90%" /> +<p class="caption"> +Figure 12.2: Zero-inflated Negative Binomial distribution +</p> +</div> <p>Mean: <span class="math inline">\(\mu = mu \cdot (1 - d)\)</span></p> <p>Variance: <span class="math inline">\(\sigma^2 = \mu \cdot (1-d) \cdot (1 + d \cdot \mu + \mu / size)\)</span></p> <p>These models introduce a new parameter <span class="math inline">\(d\)</span>, for the dropout rate, to the negative binomial model. As we saw in Chapter 19, the dropout rate of a gene is strongly correlated with the mean expression of the gene. Different zero-inflated negative binomial models use different relationships between mu and d and some may fit <span class="math inline">\(\mu\)</span> and <span class="math inline">\(d\)</span> to the expression of each gene independently.</p> <p>Finally, several methods use a Poisson-Beta distribution which is based on a mechanistic model of transcriptional bursting. 
There is strong experimental support for this model (<a href="https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-1-r7">Kim and Marioni, 2013</a>) and it provides a good fit to scRNA-seq data but it is less easy to use than the negative-binomial models and much less existing methods upon which to build than the negative binomial model.</p> -<div class="sourceCode" id="cb714"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb714-1" data-line-number="1">a <-<span class="st"> </span><span class="fl">0.1</span></a> -<a class="sourceLine" id="cb714-2" data-line-number="2">b <-<span class="st"> </span><span class="fl">0.1</span></a> -<a class="sourceLine" id="cb714-3" data-line-number="3">g <-<span class="st"> </span><span class="dv">100</span></a> -<a class="sourceLine" id="cb714-4" data-line-number="4">lambdas <-<span class="st"> </span><span class="kw">rbeta</span>(<span class="dv">1000</span>, a, b)</a> -<a class="sourceLine" id="cb714-5" data-line-number="5">counts <-<span class="st"> </span><span class="kw">sapply</span>(g<span class="op">*</span>lambdas, <span class="cf">function</span>(l) {<span class="kw">rpois</span>(<span class="dv">1</span>, <span class="dt">lambda =</span> l)})</a> -<a class="sourceLine" id="cb714-6" data-line-number="6"><span class="kw">hist</span>(</a> -<a class="sourceLine" id="cb714-7" data-line-number="7"> counts, </a> -<a class="sourceLine" id="cb714-8" data-line-number="8"> <span class="dt">col =</span> <span class="st">"grey50"</span>, </a> -<a class="sourceLine" id="cb714-9" data-line-number="9"> <span class="dt">xlab =</span> <span class="st">"Read Counts"</span>, </a> -<a class="sourceLine" id="cb714-10" data-line-number="10"> <span class="dt">main =</span> <span class="st">"Poisson-Beta"</span></a> -<a class="sourceLine" id="cb714-11" data-line-number="11">)</a></code></pre></div> -<p>Mean: +<div class="sourceCode" id="cb797"><pre class="sourceCode r"><code class="sourceCode r"><a 
class="sourceLine" id="cb797-1" data-line-number="1">a <-<span class="st"> </span><span class="fl">0.1</span></a> +<a class="sourceLine" id="cb797-2" data-line-number="2">b <-<span class="st"> </span><span class="fl">0.1</span></a> +<a class="sourceLine" id="cb797-3" data-line-number="3">g <-<span class="st"> </span><span class="dv">100</span></a> +<a class="sourceLine" id="cb797-4" data-line-number="4">lambdas <-<span class="st"> </span><span class="kw">rbeta</span>(<span class="dv">1000</span>, a, b)</a> +<a class="sourceLine" id="cb797-5" data-line-number="5">counts <-<span class="st"> </span><span class="kw">sapply</span>(g<span class="op">*</span>lambdas, <span class="cf">function</span>(l) {<span class="kw">rpois</span>(<span class="dv">1</span>, <span class="dt">lambda =</span> l)})</a> +<a class="sourceLine" id="cb797-6" data-line-number="6"><span class="kw">hist</span>(</a> +<a class="sourceLine" id="cb797-7" data-line-number="7"> counts, </a> +<a class="sourceLine" id="cb797-8" data-line-number="8"> <span class="dt">col =</span> <span class="st">"grey50"</span>, </a> +<a class="sourceLine" id="cb797-9" data-line-number="9"> <span class="dt">xlab =</span> <span class="st">"Read Counts"</span>, </a> +<a class="sourceLine" id="cb797-10" data-line-number="10"> <span class="dt">main =</span> <span class="st">"Poisson-Beta"</span></a> +<a class="sourceLine" id="cb797-11" data-line-number="11">)</a></code></pre></div> +<p><img src="de-intro_files/figure-html/pois-beta-plot-1.png" width="90%" style="display: block; margin: auto;" /> +Mean: <span class="math inline">\(\mu = g \cdot a / (a + b)\)</span></p> <p>Variance: <span class="math inline">\(\sigma^2 = g^2 \cdot a \cdot b/((a + b + 1) \cdot (a + b)^2)\)</span></p> @@ -599,38 +612,38 @@ expressed genes either by comparing the differences in variance between the grou </div> <div id="de-in-a-real-dataset" class="section level2"> <h2><span class="header-section-number">12.2</span> DE in a real dataset</h2> -<div 
class="sourceCode" id="cb715"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb715-1" data-line-number="1"><span class="kw">library</span>(scRNA.seq.funcs)</a> -<a class="sourceLine" id="cb715-2" data-line-number="2"><span class="kw">library</span>(edgeR)</a> -<a class="sourceLine" id="cb715-3" data-line-number="3"><span class="kw">library</span>(monocle)</a> -<a class="sourceLine" id="cb715-4" data-line-number="4"><span class="kw">library</span>(MAST)</a> -<a class="sourceLine" id="cb715-5" data-line-number="5"><span class="kw">library</span>(ROCR)</a> -<a class="sourceLine" id="cb715-6" data-line-number="6"><span class="kw">set.seed</span>(<span class="dv">1</span>)</a></code></pre></div> +<div class="sourceCode" id="cb798"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb798-1" data-line-number="1"><span class="kw">library</span>(scRNA.seq.funcs)</a> +<a class="sourceLine" id="cb798-2" data-line-number="2"><span class="kw">library</span>(edgeR)</a> +<a class="sourceLine" id="cb798-3" data-line-number="3"><span class="kw">library</span>(monocle)</a> +<a class="sourceLine" id="cb798-4" data-line-number="4"><span class="kw">library</span>(MAST)</a> +<a class="sourceLine" id="cb798-5" data-line-number="5"><span class="kw">library</span>(ROCR)</a> +<a class="sourceLine" id="cb798-6" data-line-number="6"><span class="kw">set.seed</span>(<span class="dv">1</span>)</a></code></pre></div> <div id="introduction-8" class="section level3"> <h3><span class="header-section-number">12.2.1</span> Introduction</h3> <p>To test different single-cell differential expression methods we will be using the Blischak dataset from Chapters 7-17. For this experiment bulk RNA-seq data for each cell-line was generated in addition to single-cell data. 
We will use the differentially expressed genes identified using standard methods on the respective bulk data as the ground truth for evaluating the accuracy of each single-cell method. To save time we have pre-computed these for you. You can run the commands below to load these data.</p> -<div class="sourceCode" id="cb716"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb716-1" data-line-number="1">DE <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/TPs.txt"</span>)</a> -<a class="sourceLine" id="cb716-2" data-line-number="2">notDE <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/TNs.txt"</span>)</a> -<a class="sourceLine" id="cb716-3" data-line-number="3">GroundTruth <-<span class="st"> </span><span class="kw">list</span>(</a> -<a class="sourceLine" id="cb716-4" data-line-number="4"> <span class="dt">DE =</span> <span class="kw">as.character</span>(<span class="kw">unlist</span>(DE)), </a> -<a class="sourceLine" id="cb716-5" data-line-number="5"> <span class="dt">notDE =</span> <span class="kw">as.character</span>(<span class="kw">unlist</span>(notDE))</a> -<a class="sourceLine" id="cb716-6" data-line-number="6">)</a></code></pre></div> +<div class="sourceCode" id="cb799"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb799-1" data-line-number="1">DE <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/TPs.txt"</span>)</a> +<a class="sourceLine" id="cb799-2" data-line-number="2">notDE <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/TNs.txt"</span>)</a> +<a class="sourceLine" id="cb799-3" data-line-number="3">GroundTruth <-<span class="st"> </span><span class="kw">list</span>(</a> +<a class="sourceLine" id="cb799-4" data-line-number="4"> <span class="dt">DE =</span> <span class="kw">as.character</span>(<span class="kw">unlist</span>(DE)), </a> 
+<a class="sourceLine" id="cb799-5" data-line-number="5"> <span class="dt">notDE =</span> <span class="kw">as.character</span>(<span class="kw">unlist</span>(notDE))</a> +<a class="sourceLine" id="cb799-6" data-line-number="6">)</a></code></pre></div> <p>This ground truth has been produce for the comparison of individual NA19101 to NA19239. Now load the respective single-cell data:</p> -<div class="sourceCode" id="cb717"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb717-1" data-line-number="1">molecules <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/molecules.txt"</span>, <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>)</a> -<a class="sourceLine" id="cb717-2" data-line-number="2">anno <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/annotation.txt"</span>, <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>, <span class="dt">header =</span> <span class="ot">TRUE</span>)</a> -<a class="sourceLine" id="cb717-3" data-line-number="3">keep <-<span class="st"> </span>anno[,<span class="dv">1</span>] <span class="op">==</span><span class="st"> "NA19101"</span> <span class="op">|</span><span class="st"> </span>anno[,<span class="dv">1</span>] <span class="op">==</span><span class="st"> "NA19239"</span></a> -<a class="sourceLine" id="cb717-4" data-line-number="4">data <-<span class="st"> </span>molecules[,keep]</a> -<a class="sourceLine" id="cb717-5" data-line-number="5">group <-<span class="st"> </span>anno[keep,<span class="dv">1</span>]</a> -<a class="sourceLine" id="cb717-6" data-line-number="6">batch <-<span class="st"> </span>anno[keep,<span class="dv">4</span>]</a> -<a class="sourceLine" id="cb717-7" data-line-number="7"><span class="co"># remove genes that aren't expressed in at least 6 cells</span></a> -<a class="sourceLine" 
id="cb717-8" data-line-number="8">gkeep <-<span class="st"> </span><span class="kw">rowSums</span>(data <span class="op">></span><span class="st"> </span><span class="dv">0</span>) <span class="op">></span><span class="st"> </span><span class="dv">5</span>;</a> -<a class="sourceLine" id="cb717-9" data-line-number="9">counts <-<span class="st"> </span>data[gkeep,]</a> -<a class="sourceLine" id="cb717-10" data-line-number="10"><span class="co"># Library size normalization</span></a> -<a class="sourceLine" id="cb717-11" data-line-number="11">lib_size =<span class="st"> </span><span class="kw">colSums</span>(counts)</a> -<a class="sourceLine" id="cb717-12" data-line-number="12">norm <-<span class="st"> </span><span class="kw">t</span>(<span class="kw">t</span>(counts)<span class="op">/</span>lib_size <span class="op">*</span><span class="st"> </span><span class="kw">median</span>(lib_size)) </a> -<a class="sourceLine" id="cb717-13" data-line-number="13"><span class="co"># Variant of CPM for datasets with library sizes of fewer than 1 mil molecules</span></a></code></pre></div> +<div class="sourceCode" id="cb800"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb800-1" data-line-number="1">molecules <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/molecules.txt"</span>, <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>)</a> +<a class="sourceLine" id="cb800-2" data-line-number="2">anno <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/annotation.txt"</span>, <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>, <span class="dt">header =</span> <span class="ot">TRUE</span>)</a> +<a class="sourceLine" id="cb800-3" data-line-number="3">keep <-<span class="st"> </span>anno[,<span class="dv">1</span>] <span class="op">==</span><span class="st"> 
"NA19101"</span> <span class="op">|</span><span class="st"> </span>anno[,<span class="dv">1</span>] <span class="op">==</span><span class="st"> "NA19239"</span></a> +<a class="sourceLine" id="cb800-4" data-line-number="4">data <-<span class="st"> </span>molecules[,keep]</a> +<a class="sourceLine" id="cb800-5" data-line-number="5">group <-<span class="st"> </span>anno[keep,<span class="dv">1</span>]</a> +<a class="sourceLine" id="cb800-6" data-line-number="6">batch <-<span class="st"> </span>anno[keep,<span class="dv">4</span>]</a> +<a class="sourceLine" id="cb800-7" data-line-number="7"><span class="co"># remove genes that aren't expressed in at least 6 cells</span></a> +<a class="sourceLine" id="cb800-8" data-line-number="8">gkeep <-<span class="st"> </span><span class="kw">rowSums</span>(data <span class="op">></span><span class="st"> </span><span class="dv">0</span>) <span class="op">></span><span class="st"> </span><span class="dv">5</span>;</a> +<a class="sourceLine" id="cb800-9" data-line-number="9">counts <-<span class="st"> </span>data[gkeep,]</a> +<a class="sourceLine" id="cb800-10" data-line-number="10"><span class="co"># Library size normalization</span></a> +<a class="sourceLine" id="cb800-11" data-line-number="11">lib_size =<span class="st"> </span><span class="kw">colSums</span>(counts)</a> +<a class="sourceLine" id="cb800-12" data-line-number="12">norm <-<span class="st"> </span><span class="kw">t</span>(<span class="kw">t</span>(counts)<span class="op">/</span>lib_size <span class="op">*</span><span class="st"> </span><span class="kw">median</span>(lib_size)) </a> +<a class="sourceLine" id="cb800-13" data-line-number="13"><span class="co"># Variant of CPM for datasets with library sizes of fewer than 1 mil molecules</span></a></code></pre></div> <p>Now we will compare various single-cell DE methods. 
We will focus on methods that performed well in Soneson and Robinson’s [2019; CITE] detailed comparison of differential expression methods for single-cell data. Note that we will only @@ -651,96 +664,96 @@ number of identical values (eg. zeros). Another issue with the KS-test is that it can be very sensitive for large sample sizes and thus it may end up as significant even though the magnitude of the difference is very small.</p> <p>Now run the test:</p> -<div class="sourceCode" id="cb718"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb718-1" data-line-number="1">pVals <-<span class="st"> </span><span class="kw">apply</span>(</a> -<a class="sourceLine" id="cb718-2" data-line-number="2"> norm, <span class="dv">1</span>, <span class="cf">function</span>(x) {</a> -<a class="sourceLine" id="cb718-3" data-line-number="3"> <span class="kw">ks.test</span>(</a> -<a class="sourceLine" id="cb718-4" data-line-number="4"> x[group <span class="op">==</span><span class="st"> "NA19101"</span>], </a> -<a class="sourceLine" id="cb718-5" data-line-number="5"> x[group <span class="op">==</span><span class="st"> "NA19239"</span>]</a> -<a class="sourceLine" id="cb718-6" data-line-number="6"> )<span class="op">$</span>p.value</a> -<a class="sourceLine" id="cb718-7" data-line-number="7"> }</a> -<a class="sourceLine" id="cb718-8" data-line-number="8">)</a> -<a class="sourceLine" id="cb718-9" data-line-number="9"><span class="co"># multiple testing correction</span></a> -<a class="sourceLine" id="cb718-10" data-line-number="10">pVals <-<span class="st"> </span><span class="kw">p.adjust</span>(pVals, <span class="dt">method =</span> <span class="st">"fdr"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb801"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb801-1" data-line-number="1">pVals <-<span class="st"> </span><span class="kw">apply</span>(</a> +<a class="sourceLine" id="cb801-2" data-line-number="2"> norm, 
<span class="dv">1</span>, <span class="cf">function</span>(x) {</a> +<a class="sourceLine" id="cb801-3" data-line-number="3"> <span class="kw">ks.test</span>(</a> +<a class="sourceLine" id="cb801-4" data-line-number="4"> x[group <span class="op">==</span><span class="st"> "NA19101"</span>], </a> +<a class="sourceLine" id="cb801-5" data-line-number="5"> x[group <span class="op">==</span><span class="st"> "NA19239"</span>]</a> +<a class="sourceLine" id="cb801-6" data-line-number="6"> )<span class="op">$</span>p.value</a> +<a class="sourceLine" id="cb801-7" data-line-number="7"> }</a> +<a class="sourceLine" id="cb801-8" data-line-number="8">)</a> +<a class="sourceLine" id="cb801-9" data-line-number="9"><span class="co"># multiple testing correction</span></a> +<a class="sourceLine" id="cb801-10" data-line-number="10">pVals <-<span class="st"> </span><span class="kw">p.adjust</span>(pVals, <span class="dt">method =</span> <span class="st">"fdr"</span>)</a></code></pre></div> <p>This code “applies†the function to each row (specified by 1) of the expression matrix, data. In the function we are returning just the p.value from the ks.test output. 
We can now consider how many of the ground truth positive and negative DE genes are detected by the KS-test:</p> <div id="evaluating-accuracy" class="section level4"> <h4><span class="header-section-number">12.2.2.1</span> Evaluating Accuracy</h4> -<div class="sourceCode" id="cb719"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb719-1" data-line-number="1">sigDE <-<span class="st"> </span><span class="kw">names</span>(pVals)[pVals <span class="op"><</span><span class="st"> </span><span class="fl">0.05</span>]</a> -<a class="sourceLine" id="cb719-2" data-line-number="2"><span class="kw">length</span>(sigDE) </a> -<a class="sourceLine" id="cb719-3" data-line-number="3"><span class="co"># Number of KS-DE genes</span></a> -<a class="sourceLine" id="cb719-4" data-line-number="4"><span class="kw">sum</span>(GroundTruth<span class="op">$</span>DE <span class="op">%in%</span><span class="st"> </span>sigDE) </a> -<a class="sourceLine" id="cb719-5" data-line-number="5"><span class="co"># Number of KS-DE genes that are true DE genes</span></a> -<a class="sourceLine" id="cb719-6" data-line-number="6"><span class="kw">sum</span>(GroundTruth<span class="op">$</span>notDE <span class="op">%in%</span><span class="st"> </span>sigDE)</a> -<a class="sourceLine" id="cb719-7" data-line-number="7"><span class="co"># Number of KS-DE genes that are truly not-DE</span></a></code></pre></div> +<div class="sourceCode" id="cb802"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb802-1" data-line-number="1">sigDE <-<span class="st"> </span><span class="kw">names</span>(pVals)[pVals <span class="op"><</span><span class="st"> </span><span class="fl">0.05</span>]</a> +<a class="sourceLine" id="cb802-2" data-line-number="2"><span class="kw">length</span>(sigDE) </a> +<a class="sourceLine" id="cb802-3" data-line-number="3"><span class="co"># Number of KS-DE genes</span></a> +<a class="sourceLine" id="cb802-4" 
data-line-number="4"><span class="kw">sum</span>(GroundTruth<span class="op">$</span>DE <span class="op">%in%</span><span class="st"> </span>sigDE) </a> +<a class="sourceLine" id="cb802-5" data-line-number="5"><span class="co"># Number of KS-DE genes that are true DE genes</span></a> +<a class="sourceLine" id="cb802-6" data-line-number="6"><span class="kw">sum</span>(GroundTruth<span class="op">$</span>notDE <span class="op">%in%</span><span class="st"> </span>sigDE)</a> +<a class="sourceLine" id="cb802-7" data-line-number="7"><span class="co"># Number of KS-DE genes that are truly not-DE</span></a></code></pre></div> <p>As you can see many more of our ground truth negative genes were identified as DE by the KS-test (false positives) than ground truth positive genes (true positives), however this may be due to the larger number of notDE genes thus we typically normalize these counts as the True positive rate (TPR), TP/(TP + FN), and False positive rate (FPR), FP/(FP+TP).</p> -<div class="sourceCode" id="cb720"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb720-1" data-line-number="1">tp <-<span class="st"> </span><span class="kw">sum</span>(GroundTruth<span class="op">$</span>DE <span class="op">%in%</span><span class="st"> </span>sigDE)</a> -<a class="sourceLine" id="cb720-2" data-line-number="2">fp <-<span class="st"> </span><span class="kw">sum</span>(GroundTruth<span class="op">$</span>notDE <span class="op">%in%</span><span class="st"> </span>sigDE)</a> -<a class="sourceLine" id="cb720-3" data-line-number="3">tn <-<span class="st"> </span><span class="kw">sum</span>(GroundTruth<span class="op">$</span>notDE <span class="op">%in%</span><span class="st"> </span><span class="kw">names</span>(pVals)[pVals <span class="op">>=</span><span class="st"> </span><span class="fl">0.05</span>])</a> -<a class="sourceLine" id="cb720-4" data-line-number="4">fn <-<span class="st"> </span><span class="kw">sum</span>(GroundTruth<span 
class="op">$</span>DE <span class="op">%in%</span><span class="st"> </span><span class="kw">names</span>(pVals)[pVals <span class="op">>=</span><span class="st"> </span><span class="fl">0.05</span>])</a> -<a class="sourceLine" id="cb720-5" data-line-number="5">tpr <-<span class="st"> </span>tp<span class="op">/</span>(tp <span class="op">+</span><span class="st"> </span>fn)</a> -<a class="sourceLine" id="cb720-6" data-line-number="6">fpr <-<span class="st"> </span>fp<span class="op">/</span>(fp <span class="op">+</span><span class="st"> </span>tn)</a> -<a class="sourceLine" id="cb720-7" data-line-number="7"><span class="kw">cat</span>(<span class="kw">c</span>(tpr, fpr))</a></code></pre></div> +<div class="sourceCode" id="cb803"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb803-1" data-line-number="1">tp <-<span class="st"> </span><span class="kw">sum</span>(GroundTruth<span class="op">$</span>DE <span class="op">%in%</span><span class="st"> </span>sigDE)</a> +<a class="sourceLine" id="cb803-2" data-line-number="2">fp <-<span class="st"> </span><span class="kw">sum</span>(GroundTruth<span class="op">$</span>notDE <span class="op">%in%</span><span class="st"> </span>sigDE)</a> +<a class="sourceLine" id="cb803-3" data-line-number="3">tn <-<span class="st"> </span><span class="kw">sum</span>(GroundTruth<span class="op">$</span>notDE <span class="op">%in%</span><span class="st"> </span><span class="kw">names</span>(pVals)[pVals <span class="op">>=</span><span class="st"> </span><span class="fl">0.05</span>])</a> +<a class="sourceLine" id="cb803-4" data-line-number="4">fn <-<span class="st"> </span><span class="kw">sum</span>(GroundTruth<span class="op">$</span>DE <span class="op">%in%</span><span class="st"> </span><span class="kw">names</span>(pVals)[pVals <span class="op">>=</span><span class="st"> </span><span class="fl">0.05</span>])</a> +<a class="sourceLine" id="cb803-5" data-line-number="5">tpr <-<span class="st"> 
</span>tp<span class="op">/</span>(tp <span class="op">+</span><span class="st"> </span>fn)</a> +<a class="sourceLine" id="cb803-6" data-line-number="6">fpr <-<span class="st"> </span>fp<span class="op">/</span>(fp <span class="op">+</span><span class="st"> </span>tn)</a> +<a class="sourceLine" id="cb803-7" data-line-number="7"><span class="kw">cat</span>(<span class="kw">c</span>(tpr, fpr))</a></code></pre></div> <p>Now we can see the TPR is much higher than the FPR indicating the KS test is identifying DE genes.</p> <p>So far we’ve only evaluated the performance at a single significance threshold. Often it is informative to vary the threshold and evaluate performance across a range of values. This is then plotted as a receiver-operating-characteristic curve (ROC) and a general accuracy statistic can be calculated as the area under this curve (AUC). We will use the ROCR package to facilitate this plotting.</p> -<div class="sourceCode" id="cb721"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb721-1" data-line-number="1"><span class="co"># Only consider genes for which we know the ground truth</span></a> -<a class="sourceLine" id="cb721-2" data-line-number="2">pVals <-<span class="st"> </span>pVals[<span class="kw">names</span>(pVals) <span class="op">%in%</span><span class="st"> </span>GroundTruth<span class="op">$</span>DE <span class="op">|</span><span class="st"> </span></a> -<a class="sourceLine" id="cb721-3" data-line-number="3"><span class="st"> </span><span class="kw">names</span>(pVals) <span class="op">%in%</span><span class="st"> </span>GroundTruth<span class="op">$</span>notDE] </a> -<a class="sourceLine" id="cb721-4" data-line-number="4">truth <-<span class="st"> </span><span class="kw">rep</span>(<span class="dv">1</span>, <span class="dt">times =</span> <span class="kw">length</span>(pVals));</a> -<a class="sourceLine" id="cb721-5" data-line-number="5">truth[<span class="kw">names</span>(pVals) <span 
class="op">%in%</span><span class="st"> </span>GroundTruth<span class="op">$</span>DE] =<span class="st"> </span><span class="dv">0</span>;</a> -<a class="sourceLine" id="cb721-6" data-line-number="6">pred <-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">prediction</span>(pVals, truth)</a> -<a class="sourceLine" id="cb721-7" data-line-number="7">perf <-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">performance</span>(pred, <span class="st">"tpr"</span>, <span class="st">"fpr"</span>)</a> -<a class="sourceLine" id="cb721-8" data-line-number="8">ROCR<span class="op">::</span><span class="kw">plot</span>(perf)</a> -<a class="sourceLine" id="cb721-9" data-line-number="9">aucObj <-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">performance</span>(pred, <span class="st">"auc"</span>)</a> -<a class="sourceLine" id="cb721-10" data-line-number="10">aucObj<span class="op">@</span>y.values[[<span class="dv">1</span>]] <span class="co"># AUC</span></a></code></pre></div> +<div class="sourceCode" id="cb804"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb804-1" data-line-number="1"><span class="co"># Only consider genes for which we know the ground truth</span></a> +<a class="sourceLine" id="cb804-2" data-line-number="2">pVals <-<span class="st"> </span>pVals[<span class="kw">names</span>(pVals) <span class="op">%in%</span><span class="st"> </span>GroundTruth<span class="op">$</span>DE <span class="op">|</span><span class="st"> </span></a> +<a class="sourceLine" id="cb804-3" data-line-number="3"><span class="st"> </span><span class="kw">names</span>(pVals) <span class="op">%in%</span><span class="st"> </span>GroundTruth<span class="op">$</span>notDE] </a> +<a class="sourceLine" id="cb804-4" data-line-number="4">truth <-<span class="st"> </span><span class="kw">rep</span>(<span class="dv">1</span>, <span class="dt">times =</span> <span class="kw">length</span>(pVals));</a> 
+<a class="sourceLine" id="cb804-5" data-line-number="5">truth[<span class="kw">names</span>(pVals) <span class="op">%in%</span><span class="st"> </span>GroundTruth<span class="op">$</span>DE] =<span class="st"> </span><span class="dv">0</span>;</a> +<a class="sourceLine" id="cb804-6" data-line-number="6">pred <-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">prediction</span>(pVals, truth)</a> +<a class="sourceLine" id="cb804-7" data-line-number="7">perf <-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">performance</span>(pred, <span class="st">"tpr"</span>, <span class="st">"fpr"</span>)</a> +<a class="sourceLine" id="cb804-8" data-line-number="8">ROCR<span class="op">::</span><span class="kw">plot</span>(perf)</a> +<a class="sourceLine" id="cb804-9" data-line-number="9">aucObj <-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">performance</span>(pred, <span class="st">"auc"</span>)</a> +<a class="sourceLine" id="cb804-10" data-line-number="10">aucObj<span class="op">@</span>y.values[[<span class="dv">1</span>]] <span class="co"># AUC</span></a></code></pre></div> <p>Finally to facilitate the comparisons of other DE methods let’s put this code into a function so we don’t need to repeat it:</p> -<div class="sourceCode" id="cb722"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb722-1" data-line-number="1">DE_Quality_AUC <-<span class="st"> </span><span class="cf">function</span>(pVals) {</a> -<a class="sourceLine" id="cb722-2" data-line-number="2"> pVals <-<span class="st"> </span>pVals[<span class="kw">names</span>(pVals) <span class="op">%in%</span><span class="st"> </span>GroundTruth<span class="op">$</span>DE <span class="op">|</span><span class="st"> </span></a> -<a class="sourceLine" id="cb722-3" data-line-number="3"><span class="st"> </span><span class="kw">names</span>(pVals) <span class="op">%in%</span><span class="st"> </span>GroundTruth<span 
class="op">$</span>notDE]</a> -<a class="sourceLine" id="cb722-4" data-line-number="4"> truth <-<span class="st"> </span><span class="kw">rep</span>(<span class="dv">1</span>, <span class="dt">times =</span> <span class="kw">length</span>(pVals));</a> -<a class="sourceLine" id="cb722-5" data-line-number="5"> truth[<span class="kw">names</span>(pVals) <span class="op">%in%</span><span class="st"> </span>GroundTruth<span class="op">$</span>DE] =<span class="st"> </span><span class="dv">0</span>;</a> -<a class="sourceLine" id="cb722-6" data-line-number="6"> pred <-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">prediction</span>(pVals, truth)</a> -<a class="sourceLine" id="cb722-7" data-line-number="7"> perf <-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">performance</span>(pred, <span class="st">"tpr"</span>, <span class="st">"fpr"</span>)</a> -<a class="sourceLine" id="cb722-8" data-line-number="8"> ROCR<span class="op">::</span><span class="kw">plot</span>(perf)</a> -<a class="sourceLine" id="cb722-9" data-line-number="9"> aucObj <-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">performance</span>(pred, <span class="st">"auc"</span>)</a> -<a class="sourceLine" id="cb722-10" data-line-number="10"> <span class="kw">return</span>(aucObj<span class="op">@</span>y.values[[<span class="dv">1</span>]])</a> -<a class="sourceLine" id="cb722-11" data-line-number="11">}</a></code></pre></div> +<div class="sourceCode" id="cb805"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb805-1" data-line-number="1">DE_Quality_AUC <-<span class="st"> </span><span class="cf">function</span>(pVals) {</a> +<a class="sourceLine" id="cb805-2" data-line-number="2"> pVals <-<span class="st"> </span>pVals[<span class="kw">names</span>(pVals) <span class="op">%in%</span><span class="st"> </span>GroundTruth<span class="op">$</span>DE <span class="op">|</span><span class="st"> </span></a> +<a 
class="sourceLine" id="cb805-3" data-line-number="3"><span class="st"> </span><span class="kw">names</span>(pVals) <span class="op">%in%</span><span class="st"> </span>GroundTruth<span class="op">$</span>notDE]</a> +<a class="sourceLine" id="cb805-4" data-line-number="4"> truth <-<span class="st"> </span><span class="kw">rep</span>(<span class="dv">1</span>, <span class="dt">times =</span> <span class="kw">length</span>(pVals));</a> +<a class="sourceLine" id="cb805-5" data-line-number="5"> truth[<span class="kw">names</span>(pVals) <span class="op">%in%</span><span class="st"> </span>GroundTruth<span class="op">$</span>DE] =<span class="st"> </span><span class="dv">0</span>;</a> +<a class="sourceLine" id="cb805-6" data-line-number="6"> pred <-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">prediction</span>(pVals, truth)</a> +<a class="sourceLine" id="cb805-7" data-line-number="7"> perf <-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">performance</span>(pred, <span class="st">"tpr"</span>, <span class="st">"fpr"</span>)</a> +<a class="sourceLine" id="cb805-8" data-line-number="8"> ROCR<span class="op">::</span><span class="kw">plot</span>(perf)</a> +<a class="sourceLine" id="cb805-9" data-line-number="9"> aucObj <-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">performance</span>(pred, <span class="st">"auc"</span>)</a> +<a class="sourceLine" id="cb805-10" data-line-number="10"> <span class="kw">return</span>(aucObj<span class="op">@</span>y.values[[<span class="dv">1</span>]])</a> +<a class="sourceLine" id="cb805-11" data-line-number="11">}</a></code></pre></div> </div> </div> <div id="wilcoxmann-whitney-u-test" class="section level3"> <h3><span class="header-section-number">12.2.3</span> Wilcox/Mann-Whitney-U Test</h3> <p>The Wilcox-rank-sum test is another non-parametric test, but tests specifically if values in one group are greater/less than the values in the other group. 
Thus it is often considered a test for difference in median expression between two groups; whereas the KS-test is sensitive to any change in distribution of expression values.</p> -<div class="sourceCode" id="cb723"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb723-1" data-line-number="1">pVals <-<span class="st"> </span><span class="kw">apply</span>(</a> -<a class="sourceLine" id="cb723-2" data-line-number="2"> norm, <span class="dv">1</span>, <span class="cf">function</span>(x) {</a> -<a class="sourceLine" id="cb723-3" data-line-number="3"> <span class="kw">wilcox.test</span>(</a> -<a class="sourceLine" id="cb723-4" data-line-number="4"> x[group <span class="op">==</span><span class="st"> "NA19101"</span>], </a> -<a class="sourceLine" id="cb723-5" data-line-number="5"> x[group <span class="op">==</span><span class="st"> "NA19239"</span>]</a> -<a class="sourceLine" id="cb723-6" data-line-number="6"> )<span class="op">$</span>p.value</a> -<a class="sourceLine" id="cb723-7" data-line-number="7"> }</a> -<a class="sourceLine" id="cb723-8" data-line-number="8">)</a> -<a class="sourceLine" id="cb723-9" data-line-number="9"><span class="co"># multiple testing correction</span></a> -<a class="sourceLine" id="cb723-10" data-line-number="10">pVals <-<span class="st"> </span><span class="kw">p.adjust</span>(pVals, <span class="dt">method =</span> <span class="st">"fdr"</span>)</a> -<a class="sourceLine" id="cb723-11" data-line-number="11"><span class="kw">DE_Quality_AUC</span>(pVals)</a></code></pre></div> +<div class="sourceCode" id="cb806"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb806-1" data-line-number="1">pVals <-<span class="st"> </span><span class="kw">apply</span>(</a> +<a class="sourceLine" id="cb806-2" data-line-number="2"> norm, <span class="dv">1</span>, <span class="cf">function</span>(x) {</a> +<a class="sourceLine" id="cb806-3" data-line-number="3"> <span 
class="kw">wilcox.test</span>(</a> +<a class="sourceLine" id="cb806-4" data-line-number="4"> x[group <span class="op">==</span><span class="st"> "NA19101"</span>], </a> +<a class="sourceLine" id="cb806-5" data-line-number="5"> x[group <span class="op">==</span><span class="st"> "NA19239"</span>]</a> +<a class="sourceLine" id="cb806-6" data-line-number="6"> )<span class="op">$</span>p.value</a> +<a class="sourceLine" id="cb806-7" data-line-number="7"> }</a> +<a class="sourceLine" id="cb806-8" data-line-number="8">)</a> +<a class="sourceLine" id="cb806-9" data-line-number="9"><span class="co"># multiple testing correction</span></a> +<a class="sourceLine" id="cb806-10" data-line-number="10">pVals <-<span class="st"> </span><span class="kw">p.adjust</span>(pVals, <span class="dt">method =</span> <span class="st">"fdr"</span>)</a> +<a class="sourceLine" id="cb806-11" data-line-number="11"><span class="kw">DE_Quality_AUC</span>(pVals)</a></code></pre></div> </div> <div id="edger" class="section level3"> <h3><span class="header-section-number">12.2.4</span> edgeR</h3> <p>We’ve already used edgeR for differential expression in Chapter -<a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders">7.6</a>. edgeR is based on a negative binomial model of +<a href="#dealing-with-confounders"><strong>??</strong></a>. 
edgeR is based on a negative binomial model of gene expression and uses a generalized linear model (GLM) framework, which enables us to include other factors such as batch in the model.</p> -<div class="sourceCode" id="cb724"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb724-1" data-line-number="1">dge <-<span class="st"> </span><span class="kw">DGEList</span>(</a> -<a class="sourceLine" id="cb724-2" data-line-number="2"> <span class="dt">counts =</span> counts, </a> -<a class="sourceLine" id="cb724-3" data-line-number="3"> <span class="dt">norm.factors =</span> <span class="kw">rep</span>(<span class="dv">1</span>, <span class="kw">length</span>(counts[<span class="dv">1</span>,])), </a> -<a class="sourceLine" id="cb724-4" data-line-number="4"> <span class="dt">group =</span> group</a> -<a class="sourceLine" id="cb724-5" data-line-number="5">)</a> -<a class="sourceLine" id="cb724-6" data-line-number="6">group_edgeR <-<span class="st"> </span><span class="kw">factor</span>(group)</a> -<a class="sourceLine" id="cb724-7" data-line-number="7">design <-<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span>group_edgeR)</a> -<a class="sourceLine" id="cb724-8" data-line-number="8">dge <-<span class="st"> </span><span class="kw">estimateDisp</span>(dge, <span class="dt">design =</span> design, <span class="dt">trend.method =</span> <span class="st">"none"</span>)</a> -<a class="sourceLine" id="cb724-9" data-line-number="9">fit <-<span class="st"> </span><span class="kw">glmFit</span>(dge, design)</a> -<a class="sourceLine" id="cb724-10" data-line-number="10">res <-<span class="st"> 
<-<span class="st"> </span><span class="kw">rownames</span>(res<span class="op">$</span>table)</a> -<a class="sourceLine" id="cb724-13" data-line-number="13"></a> -<a class="sourceLine" id="cb724-14" data-line-number="14">pVals <-<span class="st"> </span><span class="kw">p.adjust</span>(pVals, <span class="dt">method =</span> <span class="st">"fdr"</span>)</a> -<a class="sourceLine" id="cb724-15" data-line-number="15"><span class="kw">DE_Quality_AUC</span>(pVals)</a></code></pre></div> +<div class="sourceCode" id="cb807"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb807-1" data-line-number="1">dge <-<span class="st"> </span><span class="kw">DGEList</span>(</a> +<a class="sourceLine" id="cb807-2" data-line-number="2"> <span class="dt">counts =</span> counts, </a> +<a class="sourceLine" id="cb807-3" data-line-number="3"> <span class="dt">norm.factors =</span> <span class="kw">rep</span>(<span class="dv">1</span>, <span class="kw">length</span>(counts[<span class="dv">1</span>,])), </a> +<a class="sourceLine" id="cb807-4" data-line-number="4"> <span class="dt">group =</span> group</a> +<a class="sourceLine" id="cb807-5" data-line-number="5">)</a> +<a class="sourceLine" id="cb807-6" data-line-number="6">group_edgeR <-<span class="st"> </span><span class="kw">factor</span>(group)</a> +<a class="sourceLine" id="cb807-7" data-line-number="7">design <-<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span>group_edgeR)</a> +<a class="sourceLine" id="cb807-8" data-line-number="8">dge <-<span class="st"> </span><span class="kw">estimateDisp</span>(dge, <span class="dt">design =</span> design, <span class="dt">trend.method =</span> <span class="st">"none"</span>)</a> +<a class="sourceLine" id="cb807-9" data-line-number="9">fit <-<span class="st"> </span><span class="kw">glmFit</span>(dge, design)</a> +<a class="sourceLine" id="cb807-10" data-line-number="10">res <-<span class="st"> 
</span><span class="kw">glmLRT</span>(fit)</a> +<a class="sourceLine" id="cb807-11" data-line-number="11">pVals <-<span class="st"> </span>res<span class="op">$</span>table[,<span class="dv">4</span>]</a> +<a class="sourceLine" id="cb807-12" data-line-number="12"><span class="kw">names</span>(pVals) <-<span class="st"> </span><span class="kw">rownames</span>(res<span class="op">$</span>table)</a> +<a class="sourceLine" id="cb807-13" data-line-number="13"></a> +<a class="sourceLine" id="cb807-14" data-line-number="14">pVals <-<span class="st"> </span><span class="kw">p.adjust</span>(pVals, <span class="dt">method =</span> <span class="st">"fdr"</span>)</a> +<a class="sourceLine" id="cb807-15" data-line-number="15"><span class="kw">DE_Quality_AUC</span>(pVals)</a></code></pre></div> </div> <div id="mast" class="section level3"> <h3><span class="header-section-number">12.2.5</span> MAST</h3> @@ -749,27 +762,27 @@ on a zero-inflated negative binomial model. It tests for differential expression using a hurdle model to combine tests of discrete (0 vs not zero) and continuous (non-zero values) aspects of gene expression. 
Again this uses a linear modelling framework to enable complex models to be considered.</p> -<div class="sourceCode" id="cb725"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb725-1" data-line-number="1">log_counts <-<span class="st"> </span><span class="kw">log</span>(counts <span class="op">+</span><span class="st"> </span><span class="dv">1</span>) <span class="op">/</span><span class="st"> </span><span class="kw">log</span>(<span class="dv">2</span>)</a> -<a class="sourceLine" id="cb725-2" data-line-number="2">fData <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">names =</span> <span class="kw">rownames</span>(log_counts))</a> -<a class="sourceLine" id="cb725-3" data-line-number="3"><span class="kw">rownames</span>(fData) <-<span class="st"> </span><span class="kw">rownames</span>(log_counts);</a> -<a class="sourceLine" id="cb725-4" data-line-number="4">cData <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">cond =</span> group)</a> -<a class="sourceLine" id="cb725-5" data-line-number="5"><span class="kw">rownames</span>(cData) <-<span class="st"> </span><span class="kw">colnames</span>(log_counts)</a> -<a class="sourceLine" id="cb725-6" data-line-number="6"></a> -<a class="sourceLine" id="cb725-7" data-line-number="7">obj <-<span class="st"> </span><span class="kw">FromMatrix</span>(<span class="kw">as.matrix</span>(log_counts), cData, fData)</a> -<a class="sourceLine" id="cb725-8" data-line-number="8"><span class="kw">colData</span>(obj)<span class="op">$</span>cngeneson <-<span class="st"> </span><span class="kw">scale</span>(<span class="kw">colSums</span>(<span class="kw">assay</span>(obj) <span class="op">></span><span class="st"> </span><span class="dv">0</span>))</a> -<a class="sourceLine" id="cb725-9" data-line-number="9">cond <-<span class="st"> </span><span class="kw">factor</span>(<span class="kw">colData</span>(obj)<span class="op">$</span>cond)</a> -<a 
class="sourceLine" id="cb725-10" data-line-number="10"></a> -<a class="sourceLine" id="cb725-11" data-line-number="11"><span class="co"># Model expression as function of condition & number of detected genes</span></a> -<a class="sourceLine" id="cb725-12" data-line-number="12">zlmCond <-<span class="st"> </span><span class="kw">zlm.SingleCellAssay</span>(<span class="op">~</span><span class="st"> </span>cond <span class="op">+</span><span class="st"> </span>cngeneson, obj) </a> -<a class="sourceLine" id="cb725-13" data-line-number="13"></a> -<a class="sourceLine" id="cb725-14" data-line-number="14">summaryCond <-<span class="st"> </span><span class="kw">summary</span>(zlmCond, <span class="dt">doLRT =</span> <span class="st">"condNA19101"</span>)</a> -<a class="sourceLine" id="cb725-15" data-line-number="15">summaryDt <-<span class="st"> </span>summaryCond<span class="op">$</span>datatable</a> -<a class="sourceLine" id="cb725-16" data-line-number="16"></a> -<a class="sourceLine" id="cb725-17" data-line-number="17">summaryDt <-<span class="st"> </span><span class="kw">as.data.frame</span>(summaryDt)</a> -<a class="sourceLine" id="cb725-18" data-line-number="18">pVals <-<span class="st"> </span><span class="kw">unlist</span>(summaryDt[summaryDt<span class="op">$</span>component <span class="op">==</span><span class="st"> "H"</span>,<span class="dv">4</span>]) <span class="co"># H = hurdle model</span></a> -<a class="sourceLine" id="cb725-19" data-line-number="19"><span class="kw">names</span>(pVals) <-<span class="st"> </span><span class="kw">unlist</span>(summaryDt[summaryDt<span class="op">$</span>component <span class="op">==</span><span class="st"> "H"</span>,<span class="dv">1</span>])</a> -<a class="sourceLine" id="cb725-20" data-line-number="20">pVals <-<span class="st"> </span><span class="kw">p.adjust</span>(pVals, <span class="dt">method =</span> <span class="st">"fdr"</span>)</a> -<a class="sourceLine" id="cb725-21" data-line-number="21"><span 
class="kw">DE_Quality_AUC</span>(pVals)</a></code></pre></div> +<div class="sourceCode" id="cb808"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb808-1" data-line-number="1">log_counts <-<span class="st"> </span><span class="kw">log</span>(counts <span class="op">+</span><span class="st"> </span><span class="dv">1</span>) <span class="op">/</span><span class="st"> </span><span class="kw">log</span>(<span class="dv">2</span>)</a> +<a class="sourceLine" id="cb808-2" data-line-number="2">fData <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">names =</span> <span class="kw">rownames</span>(log_counts))</a> +<a class="sourceLine" id="cb808-3" data-line-number="3"><span class="kw">rownames</span>(fData) <-<span class="st"> </span><span class="kw">rownames</span>(log_counts);</a> +<a class="sourceLine" id="cb808-4" data-line-number="4">cData <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">cond =</span> group)</a> +<a class="sourceLine" id="cb808-5" data-line-number="5"><span class="kw">rownames</span>(cData) <-<span class="st"> </span><span class="kw">colnames</span>(log_counts)</a> +<a class="sourceLine" id="cb808-6" data-line-number="6"></a> +<a class="sourceLine" id="cb808-7" data-line-number="7">obj <-<span class="st"> </span><span class="kw">FromMatrix</span>(<span class="kw">as.matrix</span>(log_counts), cData, fData)</a> +<a class="sourceLine" id="cb808-8" data-line-number="8"><span class="kw">colData</span>(obj)<span class="op">$</span>cngeneson <-<span class="st"> </span><span class="kw">scale</span>(<span class="kw">colSums</span>(<span class="kw">assay</span>(obj) <span class="op">></span><span class="st"> </span><span class="dv">0</span>))</a> +<a class="sourceLine" id="cb808-9" data-line-number="9">cond <-<span class="st"> </span><span class="kw">factor</span>(<span class="kw">colData</span>(obj)<span class="op">$</span>cond)</a> +<a class="sourceLine" id="cb808-10" 
data-line-number="10"></a> +<a class="sourceLine" id="cb808-11" data-line-number="11"><span class="co"># Model expression as function of condition & number of detected genes</span></a> +<a class="sourceLine" id="cb808-12" data-line-number="12">zlmCond <-<span class="st"> </span><span class="kw">zlm.SingleCellAssay</span>(<span class="op">~</span><span class="st"> </span>cond <span class="op">+</span><span class="st"> </span>cngeneson, obj) </a> +<a class="sourceLine" id="cb808-13" data-line-number="13"></a> +<a class="sourceLine" id="cb808-14" data-line-number="14">summaryCond <-<span class="st"> </span><span class="kw">summary</span>(zlmCond, <span class="dt">doLRT =</span> <span class="st">"condNA19101"</span>)</a> +<a class="sourceLine" id="cb808-15" data-line-number="15">summaryDt <-<span class="st"> </span>summaryCond<span class="op">$</span>datatable</a> +<a class="sourceLine" id="cb808-16" data-line-number="16"></a> +<a class="sourceLine" id="cb808-17" data-line-number="17">summaryDt <-<span class="st"> </span><span class="kw">as.data.frame</span>(summaryDt)</a> +<a class="sourceLine" id="cb808-18" data-line-number="18">pVals <-<span class="st"> </span><span class="kw">unlist</span>(summaryDt[summaryDt<span class="op">$</span>component <span class="op">==</span><span class="st"> "H"</span>,<span class="dv">4</span>]) <span class="co"># H = hurdle model</span></a> +<a class="sourceLine" id="cb808-19" data-line-number="19"><span class="kw">names</span>(pVals) <-<span class="st"> </span><span class="kw">unlist</span>(summaryDt[summaryDt<span class="op">$</span>component <span class="op">==</span><span class="st"> "H"</span>,<span class="dv">1</span>])</a> +<a class="sourceLine" id="cb808-20" data-line-number="20">pVals <-<span class="st"> </span><span class="kw">p.adjust</span>(pVals, <span class="dt">method =</span> <span class="st">"fdr"</span>)</a> +<a class="sourceLine" id="cb808-21" data-line-number="21"><span 
class="kw">DE_Quality_AUC</span>(pVals)</a></code></pre></div> </div> <div id="limma" class="section level3"> <h3><span class="header-section-number">12.2.6</span> limma</h3> diff --git a/public/exprs-norm-reads.md b/public/exprs-norm-reads.md index 32a37960d632f3009517d86a9531e3c2cb5834c1..0e14a6e760535e54d3a0178ae718db41747de00a 100644 --- a/public/exprs-norm-reads.md +++ b/public/exprs-norm-reads.md @@ -552,7 +552,7 @@ output: html_document ``` ``` -## Wall clock passed: Time difference of 16.19305 secs +## Wall clock passed: Time difference of 16.15881 secs ``` ``` diff --git a/public/exprs-norm.md b/public/exprs-norm.md index 15c6d765f7cf66989c010f5814f26e2305898e4d..7002395d094b6fc972857090bbac2afb7cdf55d8 100644 --- a/public/exprs-norm.md +++ b/public/exprs-norm.md @@ -483,3058 +483,80 @@ in the `logcounts` slot of the `SingleCellExperiment` object. qclust <- quickCluster(umi.qc, min.size = 30, use.ranks = FALSE) umi.qc <- computeSumFactors(umi.qc, sizes = 15, clusters = qclust) umi.qc <- normalize(umi.qc) -``` - -``` -## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its -## own size factors -``` - -``` -## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own -## size factors -``` - -```r -plotPCA( - umi.qc[endog_genes, ], - colour_by = "batch", - size_by = "total_features_by_counts", - shape_by = "individual" -) + ggtitle("PCA plot: scran size-factor normalization") -``` - -<div class="figure" style="text-align: center"> -<img src="exprs-norm_files/figure-html/norm-pca-lsf-1.png" alt="PCA plot of the tung data after LSF normalisation" width="90%" /> -<p class="caption">(\#fig:norm-pca-lsf)PCA plot of the tung data after LSF normalisation</p> -</div> - - -```r -plotRLE( - umi.qc[endog_genes, ], - exprs_values = "logcounts", - colour_by = "batch" -) + ggtitle("RLE plot: scran size-factor normalization") -``` - -<div class="figure" style="text-align: center"> -<img 
src="exprs-norm_files/figure-html/norm-ours-rle-scran-1.png" alt="Cell-wise RLE of the tung data" width="90%" /> -<p class="caption">(\#fig:norm-ours-rle-scran)Cell-wise RLE of the tung data</p> -</div> - -`scran` sometimes calculates negative or zero size factors. These will -completely distort the normalized expression matrix. We can check the size -factors scran has computed like so: - - - -```r -summary(sizeFactors(umi.qc)) -``` - -``` -## Min. 1st Qu. Median Mean 3rd Qu. Max. -## 0.4836 0.7747 0.9532 1.0000 1.1483 3.2873 -``` - -For this dataset all the size factors are reasonable so we are done. If you find -scran has calculated negative size factors try increasing the cluster and pool -sizes until they are all positive. - -We sometimes filter out cells with very large size-factors (you may like to -think about why), but we will not demonstrate that here. - - -### sctransform - -The `sctransform` approach to using Pearson residuals from an regularized -negative binomial generalized linear model was introduced above. Here we -demonstrate how to apply this method. - -Note that (due to what looks like a bug in this version of `sctransform`) we -need to convert the UMI count matrix to a sparse format to apply sctransform. 
- - - -```r -umi_sparse <- as(counts(umi.qc), "dgCMatrix") -### Genes expressed in at least 5 cells will be kept -sctnorm_data <- sctransform::vst(umi = umi_sparse, min_cells = 1, - cell_attr = as.data.frame(colData(umi.qc)), - latent_var = "log10_total_counts_endogenous") -``` - -``` -## Calculating cell attributes for input UMI matrix -``` - -``` -## Variance stabilizing transformation of count matrix of size 14066 by 657 -``` - -``` -## Model formula is y ~ log10_total_counts_endogenous -``` - -``` -## Get Negative Binomial regression parameters per gene -``` - -``` -## Using 2000 genes, 657 cells -``` - -``` -## | | | 0% -``` - -``` -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -``` - -``` -## | |======== | 12% -``` - -``` -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -``` - -``` -## | |================ | 25% -``` - -``` -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - 
-## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached -``` - -``` -## | |======================== | 38% -``` - -``` -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -``` - -``` -## | |================================ | 50% -``` - -``` -## Warning in theta.ml(y = y, mu = fit$fitted): iteration 
limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu 
= fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -``` - -``` -## | |========================================= | 62% -``` - -``` -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached -``` - -``` -## | |================================================= | 75% -``` - -``` -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +plotPCA( + umi.qc[endog_genes, ], + colour_by = "batch", + size_by = "total_features_by_counts", + shape_by = "individual" +) + ggtitle("PCA plot: scran size-factor normalization") +``` -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +<div class="figure" style="text-align: center"> +<img src="exprs-norm_files/figure-html/norm-pca-lsf-1.png" alt="PCA plot of the tung data after LSF normalisation" width="90%" /> +<p class="caption">(\#fig:norm-pca-lsf)PCA plot of the tung data after LSF normalisation</p> +</div> -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +```r +plotRLE( + umi.qc[endog_genes, ], + exprs_values = "logcounts", + colour_by = "batch" +) + ggtitle("RLE plot: scran size-factor normalization") +``` -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +<div class="figure" style="text-align: center"> +<img src="exprs-norm_files/figure-html/norm-ours-rle-scran-1.png" alt="Cell-wise RLE of the tung data" width="90%" /> +<p class="caption">(\#fig:norm-ours-rle-scran)Cell-wise RLE of the tung data</p> +</div> -## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached +`scran` sometimes calculates negative or zero size factors. These will +completely distort the normalized expression matrix. We can check the size +factors scran has computed like so: -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +```r +summary(sizeFactors(umi.qc)) ``` ``` -## | |========================================================= | 88% -``` - +## Min. 1st Qu. Median Mean 3rd Qu. Max. +## 0.4836 0.7747 0.9532 1.0000 1.1483 3.2873 ``` -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +For this dataset all the size factors are reasonable so we are done. If you find +scran has calculated negative size factors try increasing the cluster and pool +sizes until they are all positive. -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +We sometimes filter out cells with very large size-factors (you may like to +think about why), but we will not demonstrate that here. -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +### sctransform -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +The `sctransform` approach to using Pearson residuals from an regularized +negative binomial generalized linear model was introduced above. Here we +demonstrate how to apply this method. -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -``` +Note that (due to what looks like a bug in this version of `sctransform`) we +need to convert the UMI count matrix to a sparse format to apply sctransform. 
-``` -## | |=================================================================| 100% -``` -``` -## Found 5 outliers - those will be ignored in fitting/regularization step -``` -``` -## Second step: Get residuals using fitted parameters for 14066 genes +```r +umi_sparse <- as(counts(umi.qc), "dgCMatrix") +### Genes expressed in at least 5 cells will be kept +sctnorm_data <- sctransform::vst(umi = umi_sparse, min_cells = 1, + cell_attr = as.data.frame(colData(umi.qc)), + latent_var = "log10_total_counts_endogenous") ``` ``` +## | | | 0% | |======== | 12% | |================ | 25% | |======================== | 38% | |================================ | 50% | |========================================= | 62% | |================================================= | 75% | |========================================================= | 88% | |=================================================================| 100% ## | | | 0% | |= | 2% | |== | 4% | |==== | 5% | |===== | 7% | |====== | 9% | |======= | 11% | |======== | 13% | |========= | 15% | |=========== | 16% | |============ | 18% | |============= | 20% | |============== | 22% | |=============== | 24% | |================= | 25% | |================== | 27% | |=================== | 29% | |==================== | 31% | |===================== | 33% | |====================== | 35% | |======================== | 36% | |========================= | 38% | |========================== | 40% | |=========================== | 42% | |============================ | 44% | |============================== | 45% | |=============================== | 47% | |================================ | 49% | |================================= | 51% | |================================== | 53% | |=================================== | 55% | |===================================== | 56% | |====================================== | 58% | |======================================= | 60% | |======================================== | 62% | 
|========================================= | 64% | |=========================================== | 65% | |============================================ | 67% | |============================================= | 69% | |============================================== | 71% | |=============================================== | 73% | |================================================ | 75% | |================================================== | 76% | |=================================================== | 78% | |==================================================== | 80% | |===================================================== | 82% | |====================================================== | 84% | |======================================================== | 85% | |========================================================= | 87% | |========================================================== | 89% | |=========================================================== | 91% | |============================================================ | 93% | |============================================================= | 95% | |=============================================================== | 96% | |================================================================ | 98% | |=================================================================| 100% ``` -``` -## Calculating gene attributes -``` - -``` -## Wall clock passed: Time difference of 21.43582 secs -``` - ```r ## Pearson residuals, or deviance residuals dim(sctnorm_data$y) @@ -3708,11 +730,6 @@ eff_length <- plot(eff_length, rowMeans(counts(umi.qc.ann))) ``` -<div class="figure" style="text-align: center"> -<img src="exprs-norm_files/figure-html/length-vs-mean-1.png" alt="Gene length vs Mean Expression for the raw data" width="90%" /> -<p class="caption">(\#fig:length-vs-mean)Gene length vs Mean Expression for the raw data</p> -</div> - There is no relationship between gene length and mean expression so __FPKM__s & __TPM__s are inappropriate for this 
dataset. This is what we would expect for UMI protocols that tag one end of the transcript. But we will demonstrate them @@ -3746,26 +763,11 @@ plotPCA( ) ``` -<div class="figure" style="text-align: center"> -<img src="exprs-norm_files/figure-html/norm-pca-fpkm-1.png" alt="PCA plot of the tung data after TPM normalisation" width="90%" /> -<p class="caption">(\#fig:norm-pca-fpkm)PCA plot of the tung data after TPM normalisation</p> -</div> - ```r tpm(umi.qc.ann) <- log2(calculateFPKM(umi.qc.ann, eff_length) + 1) ``` -``` -## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its -## own size factors -``` - -``` -## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own -## size factors -``` - ```r tmp <- runPCA( @@ -3780,11 +782,6 @@ plotPCA( ) ``` -<div class="figure" style="text-align: center"> -<img src="exprs-norm_files/figure-html/norm-pca-tpm-1.png" alt="PCA plot of the tung data after FPKM normalisation" width="90%" /> -<p class="caption">(\#fig:norm-pca-tpm)PCA plot of the tung data after FPKM normalisation</p> -</div> - __Note__ The `PCA` looks for differences between cells. Gene length is the same across cells for each gene thus __FPKM__ is almost identical to the __CPM__ plot (it is just rotated) since it performs __CPM__ first then normalizes gene @@ -3840,52 +837,43 @@ please compare your results to ours (next chapter). 
## [15] scRNA.seq.funcs_0.1.0 ## ## loaded via a namespace (and not attached): -## [1] bitops_1.0-6 bit64_0.9-7 -## [3] httr_1.4.1 progress_1.2.2 -## [5] dynamicTreeCut_1.63-1 backports_1.1.4 -## [7] sctransform_0.2.0 tools_3.6.0 -## [9] R6_2.4.0 irlba_2.3.3 -## [11] hypergeo_1.2-13 vipor_0.4.5 -## [13] DBI_1.0.0 lazyeval_0.2.2 -## [15] colorspace_1.4-1 withr_2.1.2 -## [17] prettyunits_1.0.2 tidyselect_0.2.5 -## [19] gridExtra_2.3 moments_0.14 -## [21] curl_4.2 bit_1.1-14 -## [23] compiler_3.6.0 orthopolynom_1.0-5 -## [25] BiocNeighbors_1.2.0 labeling_0.3 -## [27] bookdown_0.13 scales_1.0.0 -## [29] stringr_1.4.0 digest_0.6.21 -## [31] rmarkdown_1.15 XVector_0.24.0 -## [33] pkgconfig_2.0.3 htmltools_0.3.6 -## [35] limma_3.40.6 highr_0.8 -## [37] rlang_0.4.0 RSQLite_2.1.2 -## [39] DelayedMatrixStats_1.6.1 dplyr_0.8.3 -## [41] RCurl_1.95-4.12 magrittr_1.5 -## [43] BiocSingular_1.0.0 GenomeInfoDbData_1.2.1 -## [45] Matrix_1.2-17 Rcpp_1.0.2 -## [47] ggbeeswarm_0.6.0 munsell_0.5.0 -## [49] viridis_0.5.1 stringi_1.4.3 -## [51] yaml_2.2.0 edgeR_3.26.8 -## [53] MASS_7.3-51.1 zlibbioc_1.30.0 -## [55] Rtsne_0.15 plyr_1.8.4 -## [57] blob_1.2.0 grid_3.6.0 -## [59] listenv_0.7.0 dqrng_0.2.1 -## [61] crayon_1.3.4 contfrac_1.1-12 -## [63] lattice_0.20-38 cowplot_1.0.0 -## [65] hms_0.5.1 locfit_1.5-9.1 -## [67] zeallot_0.1.0 knitr_1.25 -## [69] pillar_1.4.2 igraph_1.2.4.1 -## [71] future.apply_1.3.0 reshape2_1.4.3 -## [73] codetools_0.2-16 biomaRt_2.40.4 -## [75] XML_3.98-1.20 glue_1.3.1 -## [77] evaluate_0.14 deSolve_1.24 -## [79] vctrs_0.2.0 gtable_0.3.0 -## [81] purrr_0.3.2 future_1.14.0 -## [83] assertthat_0.2.1 xfun_0.9 -## [85] rsvd_1.0.2 viridisLite_0.3.0 -## [87] tibble_2.1.3 elliptic_1.4-0 -## [89] memoise_1.1.0 AnnotationDbi_1.46.1 -## [91] beeswarm_0.2.3 globals_0.12.4 -## [93] statmod_1.4.32 +## [1] viridis_0.5.1 dynamicTreeCut_1.63-1 +## [3] edgeR_3.26.8 BiocSingular_1.0.0 +## [5] viridisLite_0.3.0 DelayedMatrixStats_1.6.1 +## [7] elliptic_1.4-0 moments_0.14 +## [9] 
assertthat_0.2.1 statmod_1.4.32 +## [11] highr_0.8 dqrng_0.2.1 +## [13] GenomeInfoDbData_1.2.1 vipor_0.4.5 +## [15] yaml_2.2.0 globals_0.12.4 +## [17] pillar_1.4.2 lattice_0.20-38 +## [19] glue_1.3.1 limma_3.40.6 +## [21] digest_0.6.21 XVector_0.24.0 +## [23] colorspace_1.4-1 plyr_1.8.4 +## [25] cowplot_1.0.0 htmltools_0.3.6 +## [27] Matrix_1.2-17 pkgconfig_2.0.3 +## [29] listenv_0.7.0 bookdown_0.13 +## [31] zlibbioc_1.30.0 purrr_0.3.2 +## [33] scales_1.0.0 Rtsne_0.15 +## [35] tibble_2.1.3 withr_2.1.2 +## [37] lazyeval_0.2.2 magrittr_1.5 +## [39] crayon_1.3.4 evaluate_0.14 +## [41] future_1.14.0 MASS_7.3-51.1 +## [43] beeswarm_0.2.3 tools_3.6.0 +## [45] stringr_1.4.0 locfit_1.5-9.1 +## [47] munsell_0.5.0 irlba_2.3.3 +## [49] orthopolynom_1.0-5 compiler_3.6.0 +## [51] rsvd_1.0.2 contfrac_1.1-12 +## [53] rlang_0.4.0 grid_3.6.0 +## [55] RCurl_1.95-4.12 BiocNeighbors_1.2.0 +## [57] igraph_1.2.4.1 labeling_0.3 +## [59] bitops_1.0-6 rmarkdown_1.15 +## [61] codetools_0.2-16 hypergeo_1.2-13 +## [63] gtable_0.3.0 deSolve_1.24 +## [65] reshape2_1.4.3 R6_2.4.0 +## [67] gridExtra_2.3 knitr_1.25 +## [69] dplyr_0.8.3 future.apply_1.3.0 +## [71] stringi_1.4.3 ggbeeswarm_0.6.0 +## [73] Rcpp_1.0.2 sctransform_0.2.0 +## [75] tidyselect_0.2.5 xfun_0.9 ``` diff --git a/public/exprs-qc.md b/public/exprs-qc.md index cca9df72be0c7ae87f71d08a833ddc6b452e1c11..0b1e6e72f30a12225977a94dd8742947f9e019fc 100644 --- a/public/exprs-qc.md +++ b/public/exprs-qc.md @@ -404,6 +404,22 @@ __Hint__: Use `vennCounts` and `vennDiagram` functions from the [limma](https:// __Answer__ + +```r +library(limma) +auto <- colnames(umi)[umi$outlier] +man <- colnames(umi)[!umi$use] +venn.diag <- vennCounts( + cbind(colnames(umi) %in% auto, + colnames(umi) %in% man) +) +vennDiagram( + venn.diag, + names = c("Automatic", "Manual"), + circle.col = c("blue", "green") +) +``` + <div class="figure" style="text-align: center"> <img src="exprs-qc_files/figure-html/cell-filt-comp-1.png" alt="Comparison of the default, 
automatic and manual cell filters" width="90%" /> <p class="caption">(\#fig:cell-filt-comp)Comparison of the default, automatic and manual cell filters</p> @@ -417,7 +433,7 @@ one droplet resulting one cell barcode actually containing read information from multiple cells. One way to find doublets/multiplets in the data is to see if there are cells co-expressing markers of distinct cell types. There are also computational tools available for detecting potential doublets in the cells. A -lot of these tools rely on artifical doublets formed from the datasets by +lot of these tools rely on artificial doublets formed from the datasets by randomly joining the expression profiles of two cells. Then the cells are tested against the artificial doublet profiles. @@ -425,7 +441,7 @@ We demonstrate the usage of two of these doublet detection tools. ### scds -`scds` has two detection methods: +`scds`[@Bais2019-hf] has two detection methods: 1) co-expression based; 2) binary-classification based. @@ -435,7 +451,7 @@ estimated based on a binomial model and gene pairs that do not co-expression often get higher scores when they co-expression in some cells. The cells' doublet scores are derived based on the co-expression of pairs of genes. In the binary classification based approach, artificial doublet clusters are generated -and cells are difficult to separte from the artificial doublets get higher +and cells are difficult to separate from the artificial doublets get higher doublet scores. @@ -449,28 +465,49 @@ umi = bcds(umi) ``` ``` -## [1] train-error:0.065830+0.003564 test-error:0.099490+0.021301 +## [1] train-error:0.056712+0.006782 test-error:0.090820+0.022608 ## Multiple eval metrics are present. Will use test_error for early stopping. ## Will train until test_error hasn't improved in 2 rounds. 
## -## [2] train-error:0.050057+0.006014 test-error:0.079285+0.013055 -## [3] train-error:0.039643+0.003520 test-error:0.071160+0.011813 -## [4] train-error:0.033855+0.005592 test-error:0.065367+0.015263 -## [5] train-error:0.029079+0.005075 test-error:0.065377+0.010823 -## [6] train-error:0.026621+0.005039 test-error:0.061324+0.010783 -## [7] train-error:0.019819+0.003215 test-error:0.054953+0.011878 -## [8] train-error:0.018662+0.003859 test-error:0.054358+0.014838 -## [9] train-error:0.016493+0.002438 test-error:0.058421+0.010390 -## [10] train-error:0.014901+0.004077 test-error:0.056687+0.009949 +## [2] train-error:0.042102+0.002537 test-error:0.084458+0.011641 +## [3] train-error:0.031539+0.002448 test-error:0.071155+0.009566 +## [4] train-error:0.029224+0.001912 test-error:0.072279+0.017508 +## [5] train-error:0.024595+0.002624 test-error:0.066512+0.016282 +## [6] train-error:0.021412+0.001913 test-error:0.063073+0.009557 +## [7] train-error:0.018373+0.002762 test-error:0.056687+0.016847 +## [8] train-error:0.016636+0.004358 test-error:0.052079+0.011572 +## [9] train-error:0.014466+0.002777 test-error:0.051499+0.008444 +## [10] train-error:0.012731+0.001173 test-error:0.048021+0.010077 +## [11] train-error:0.012586+0.001800 test-error:0.046292+0.011280 +## [12] train-error:0.009692+0.002442 test-error:0.045707+0.009178 +## [13] train-error:0.007957+0.002586 test-error:0.043398+0.007749 +## [14] train-error:0.007378+0.002521 test-error:0.043393+0.009114 +## [15] train-error:0.007668+0.002402 test-error:0.043398+0.008171 +## [16] train-error:0.006944+0.002024 test-error:0.041084+0.009753 +## [17] train-error:0.004919+0.002115 test-error:0.038186+0.008825 +## [18] train-error:0.004774+0.002024 test-error:0.038761+0.008839 +## [19] train-error:0.003906+0.001863 test-error:0.037021+0.008974 +## [20] train-error:0.003038+0.001674 test-error:0.036447+0.008460 +## [21] train-error:0.002604+0.001084 test-error:0.037606+0.009120 +## [22] train-error:0.002604+0.000982 
test-error:0.038181+0.010044 ## Stopping. Best iteration: -## [8] train-error:0.018662+0.003859 test-error:0.054358+0.014838 +## [20] train-error:0.003038+0.001674 test-error:0.036447+0.008460 ## -## [1] train-error:0.061921 +## [1] train-error:0.065972 ## Will train until train_error hasn't improved in 2 rounds. ## -## [2] train-error:0.052083 -## [3] train-error:0.039352 -## [4] train-error:0.031829 +## [2] train-error:0.046875 +## [3] train-error:0.030671 +## [4] train-error:0.028356 +## [5] train-error:0.022569 +## [6] train-error:0.021412 +## [7] train-error:0.019676 +## [8] train-error:0.018519 +## [9] train-error:0.016204 +## [10] train-error:0.013310 +## [11] train-error:0.011574 +## [12] train-error:0.009838 +## [13] train-error:0.008102 ``` ```r @@ -483,13 +520,13 @@ head(cbind(CD$cxds_score,CD$bcds_score, CD$hybrid_score)) ``` ``` -## [,1] [,2] [,3] -## NA19098.r1.A01 4131.405 0.05192234 0.2552833 -## NA19098.r1.A02 4564.089 0.03846648 0.2656644 -## NA19098.r1.A03 2827.904 0.03932181 0.1647904 -## NA19098.r1.A04 4708.213 0.04480528 0.2811814 -## NA19098.r1.A05 6134.590 0.03854402 0.3578605 -## NA19098.r1.A06 5810.730 0.03731131 0.3374924 +## [,1] [,2] [,3] +## NA19098.r1.A01 4131.405 0.013268524 0.2493021 +## NA19098.r1.A02 4564.089 0.006372486 0.2676119 +## NA19098.r1.A03 2827.904 0.002598290 0.1619169 +## NA19098.r1.A04 4708.213 0.013077467 0.2829361 +## NA19098.r1.A05 6134.590 0.005533409 0.3588618 +## NA19098.r1.A06 5810.730 0.006969100 0.3413388 ``` ```r diff --git a/public/exprs-qc_files/figure-html/unnamed-chunk-15-1.png b/public/exprs-qc_files/figure-html/unnamed-chunk-15-1.png index 66b9cb17ce7037a72b70895a060850bb506d1a5b..26ad1d58bab8c70bd1e5a8dd1cf438353c00a842 100644 Binary files a/public/exprs-qc_files/figure-html/unnamed-chunk-15-1.png and b/public/exprs-qc_files/figure-html/unnamed-chunk-15-1.png differ diff --git a/public/exprs-qc_files/figure-html/unnamed-chunk-16-1.png b/public/exprs-qc_files/figure-html/unnamed-chunk-16-1.png index 
1df19665e89f9028bdb01d284e932a72a165fa31..3fd582342bc3d4a181f10f690e4acb70127c0247 100644 Binary files a/public/exprs-qc_files/figure-html/unnamed-chunk-16-1.png and b/public/exprs-qc_files/figure-html/unnamed-chunk-16-1.png differ diff --git a/public/feature-selection.md b/public/feature-selection.md index c1647bcae06dbec6a019d5793632370d244de1af..ba76ab1117ccc715bf7ac0dcf657e7c8b8e80964 100644 --- a/public/feature-selection.md +++ b/public/feature-selection.md @@ -14,6 +14,9 @@ library(matrixStats) library(M3Drop) library(RColorBrewer) library(SingleCellExperiment) +library(Polychrome) +library(scater) +library(scran) set.seed(1) ``` @@ -46,7 +49,8 @@ For this section we will continue working with the Deng data. ```r deng <- readRDS("data/deng/deng-reads.rds") celltype_labs <- colData(deng)$cell_type2 -cell_colors <- brewer.pal(max(3,length(unique(celltype_labs))), "Set3") +cell_colors <- createPalette(10, c("#010101", "#ff0000"), M=1000) +names(cell_colors) <- unique(as.character(celltype_labs)) ``` Feature selection is performed after QC, however this data has already been QCed so @@ -65,6 +69,10 @@ command below. expr_matrix <- M3Drop::M3DropConvertData(deng) ``` +``` +## [1] "Removing 1134 undetected genes." +``` + This function is compatible with most single-cell RNA-seq analysis packages including: scater, SingleCellExperiment, monocle, and Seurat. It can also convert an existing expression matrix to the correct form (removing undetected genes & normalizing/delogging) @@ -85,12 +93,13 @@ first is to identify genes which behave differently from a null model describing just the technical noise expected in the dataset. If the dataset contains spike-in RNAs they can be used to directly model -technical noise. However, measurements of spike-ins may not experience -the same technical noise as endogenous transcripts [(Svensson et al., 2017)](https://www.nature.com/nmeth/journal/v14/n4/full/nmeth.4220.html). 
-In addition, scRNASeq experiments often contain only a small number of -spike-ins which reduces our confidence in fitted model parameters. +technical noise. However, measurements of spike-ins may not experience the same +technical noise as endogenous transcripts [(Svensson et al., +2017)](https://www.nature.com/nmeth/journal/v14/n4/full/nmeth.4220.html). In +addition, scRNASeq experiments often contain only a small number of spike-ins +which reduces our confidence in fitted model parameters. -#### Highly Variable Genes +#### Highly Variable Genes - Brennecke method The first method proposed to identify features in scRNASeq datasets was to identify highly variable genes (HVG). HVG assumes that if genes @@ -104,21 +113,23 @@ __Exercise 2__ Using the functions rowMeans and rowVars to plot the relationship between mean expression and variance for all genes in this dataset. (Hint: use log="xy" to plot on a log-scale). +<img src="feature-selection_files/figure-html/unnamed-chunk-6-1.png" width="90%" style="display: block; margin: auto;" /> -A popular method to correct for the relationship between variance and mean expression -was proposed by [Brennecke et al.](http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2645.html). -To use the Brennecke method, we first normalize for library size then calculate -the mean and the square coefficient of variation (variation divided by -the squared mean expression). A quadratic curve is fit to the relationship -between these two variables for the ERCC spike-in, and then a chi-square test is used to find genes -significantly above the curve. This method is included in the M3Drop package as the -Brennecke_getVariableGenes(counts, spikes) function. However, this dataset does not contain spike-ins -so we will use the entire dataset to estimate the technical noise. 
+An early method to correct for the relationship between variance and mean +expression was proposed by [Brennecke et +al.](http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2645.html). To use +the Brennecke method, we first normalize for library size then calculate the +mean and the square coefficient of variation (variation divided by the squared +mean expression). A quadratic curve is fit to the relationship between these two +variables for the ERCC spike-in, and then a chi-square test is used to find +genes significantly above the curve. This method is included in the M3Drop +package as the Brennecke_getVariableGenes(counts, spikes) function. However, +this dataset does not contain spike-ins so we will use the entire dataset to +estimate the technical noise. -In the figure below the red curve -is the fitted technical noise model and the dashed line is the 95% -CI. Pink dots are the genes with significant biological variability -after multiple-testing correction. +In the figure below the red curve is the fitted technical noise model and the +dashed line is the 95% CI. Pink dots are the genes with significant biological +variability after multiple-testing correction. ```r @@ -129,9 +140,12 @@ Brennecke_HVG <- BrenneckeGetVariableGenes( ) ``` -This function returns a matrix of significant genes as well as their estimated effect size (difference -between observed and expected coefficient of variation), and their significance as raw p.values and -FDR corrected q.values. For now we will just keep the names of the significant HVG genes. +<img src="feature-selection_files/figure-html/unnamed-chunk-7-1.png" width="90%" style="display: block; margin: auto;" /> + +This function returns a matrix of significant genes as well as their estimated +effect size (difference between observed and expected coefficient of variation), +and their significance as raw p.values and FDR corrected q.values. For now we +will just keep the names of the significant HVG genes. 
```r @@ -142,23 +156,114 @@ __Exercise 3__ How many genes were signifcant using BrenneckeGetVariableGenes? +``` +## [1] 1303 +``` + +#### Highly Variable Genes - simpleSingleCell method + +The Bioconductor +[simpleSingleCell](https://bioconductor.org/packages/release/workflows/html/simpleSingleCell.html) +workflow has a great deal of excellent material to help your analyses. Here, we +show how to identify highly variable genes using functionality from the `scran` +package. + +This method assumes that technical variance is captured by a Poisson +distribution, and that variance beyond that explained by a Poisson distribution +represents biological variance of interest. This approach separates the +biological component of the variance from the technical component and thus can +rank genes based on their "biological" variance. This model also provides +p-values (with FDR adjustment) that can be used to identify the set of +"significant" highly variable genes at a given significance level. + + +```r +### mamke a technical trend of variance based on Poisson +var.fit <- trendVar(deng, parametric=TRUE, loess.args=list(span=0.4), use.spikes = FALSE) +var.out <- decomposeVar(deng, var.fit) +plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression", + ylab="Variance of log-expression") +points(var.out$mean[isSpike(deng)], var.out$total[isSpike(deng)], col="red", pch=16) +curve(var.fit$trend(x), col="dodgerblue", add=TRUE, lwd=2) +``` + +<img src="feature-selection_files/figure-html/hvg-simpleSingleCell-1.png" width="90%" style="display: block; margin: auto;" /> + +```r +chosen.genes <- order(var.out$bio, decreasing=TRUE)[1:10] +plotExpression(deng, rownames(var.out)[chosen.genes], + point_alpha=0.5, jitter_type="jitter") +``` + +<img src="feature-selection_files/figure-html/hvg-simpleSingleCell-2.png" width="90%" style="display: block; margin: auto;" /> + +```r +top.dec <- var.out[order(var.out$bio, decreasing=TRUE),] + # the highly variable genes with 
largest biological components +head(top.dec) +``` + +``` +## DataFrame with 6 rows and 6 columns +## mean total bio +## <numeric> <numeric> <numeric> +## Obox6 7.0852220910669 39.7469062194493 27.7222625676479 +## BC053393 6.23846872763624 36.7868129334449 22.7409221497424 +## Krt18 8.06957111931139 30.7163256353151 21.3338604240051 +## Upp1 6.70443458808406 32.9196031154138 19.9537242012223 +## Akr1b8 9.31035205790714 25.9351262454146 19.563014227718 +## Spp1 5.52672835522051 34.8140952020968 19.5492807120572 +## tech p.value FDR +## <numeric> <numeric> <numeric> +## Obox6 12.0246436518013 6.67046481158613e-67 4.98750653962295e-64 +## BC053393 14.0458907837025 1.89687518927716e-40 5.90955657926056e-38 +## Krt18 9.38246521130992 1.28064383710762e-65 9.26649093876163e-63 +## Upp1 12.9658789141915 1.39045180596497e-37 3.89865305745004e-35 +## Akr1b8 6.37211201769662 2.70679041028919e-99 5.51963779029062e-96 +## Spp1 15.2648144900397 9.4641203490752e-29 1.76908069625088e-26 +``` + +```r +simplesinglecell_genes <- rownames(top.dec)[top.dec$FDR < 0.001] +table(top.dec$FDR < 0.001) +``` + +``` +## +## FALSE TRUE +## 21124 1307 +``` + +If we set an FDR threshold of 0.1%, this approach identifies around 1300 highly +variable genes. + +The output of this variance modelling can be used as input to a `denoisePCA()` +function to compute "denoised" principal components for clustering and other +downstream analyses (details not shown here; please see the `simpleSingleCell` +workflow). + + #### High Dropout Genes -An alternative to finding HVGs is to identify genes with unexpectedly high numbers of zeros. -The frequency of zeros, known as the "dropout rate", is very closely related to expression level -in scRNASeq data. Zeros are the dominant feature of single-cell RNASeq data, typically accounting -for over half of the entries in the final expression matrix. 
These zeros predominantly result -from the failure of mRNAs failing to be reversed transcribed [(Andrews and Hemberg, 2016)](http://www.biorxiv.org/content/early/2017/05/25/065094). Reverse transcription -is an enzyme reaction thus can be modelled using the Michaelis-Menten equation: +An alternative to finding HVGs is to identify genes with unexpectedly high +numbers of zeros. The frequency of zeros, known as the "dropout rate", is very +closely related to expression level in scRNASeq data. Zeros are the dominant +feature of single-cell RNASeq data, typically accounting for over half of the +entries in the final expression matrix. These zeros predominantly result from +the failure of mRNAs failing to be reversed transcribed [(Andrews and Hemberg, +2016)](http://www.biorxiv.org/content/early/2017/05/25/065094). Reverse +transcription is an enzyme reaction thus can be modelled using the +Michaelis-Menten equation: $$P_{dropout} = 1 - S/(K + S)$$ -where $S$ is the mRNA concentration in the cell (we will estimate this as average expression) -and $K$ is the Michaelis-Menten constant. +where $S$ is the mRNA concentration in the cell (we will estimate this as +average expression) and $K$ is the Michaelis-Menten constant. -Because the Michaelis-Menten equation is a convex non-linear function, genes which are -differentially expression across two or more populations of cells in our dataset will -be shifted up/right of the Michaelis-Menten model (see Figure below). +Because the Michaelis-Menten equation is a convex non-linear function, genes +which are differentially expression across two or more populations of cells in +our dataset will be shifted up/right of the Michaelis-Menten model (see Figure +below). ```r @@ -194,9 +299,15 @@ points( cex = 3 ) ``` -__Note__: add `log="x"` to the `plot` call above to see how this looks on the log scale, which is used in M3Drop figures. 
-__Exercise 4__: Produce the same plot as above with different expression levels (S1 & S2) and/or mixtures (mix). +<img src="feature-selection_files/figure-html/unnamed-chunk-10-1.png" width="90%" style="display: block; margin: auto;" /> + +__Note__: add `log="x"` to the `plot` call above to see how this looks on the +log scale, which is used in M3Drop figures. + +__Exercise 4__: Produce the same plot as above with different expression levels +(S1 & S2) and/or mixtures (mix). + We use M3Drop to identify significant outliers to the right of the MM @@ -209,6 +320,11 @@ M3Drop_genes <- M3DropFeatureSelection( mt_method = "fdr", mt_threshold = 0.01 ) +``` + +<img src="feature-selection_files/figure-html/unnamed-chunk-12-1.png" width="90%" style="display: block; margin: auto;" /> + +```r M3Drop_genes <- M3Drop_genes$Gene ``` @@ -235,70 +351,161 @@ by effect size. ```r deng_int <- NBumiConvertData(deng) +``` + +``` +## [1] "Removing 1134 undetected genes." +``` + +```r DANB_fit <- NBumiFitModel(deng_int) # DANB is fit to the raw count matrix # Perform DANB feature selection DropFS <- NBumiFeatureSelectionCombinedDrop(DANB_fit, method="fdr", qval.thresh=0.01, suppress.plot=FALSE) +``` + +<img src="feature-selection_files/figure-html/unnamed-chunk-13-1.png" width="90%" style="display: block; margin: auto;" /> + +```r DANB_genes <- DropFS[1:1500,]$Gene ``` __Exercise 5__ How many genes were signifcant using NBumiFeatureSelectionCombinedDrop? +``` +## [1] 10694 +``` -### Correlated Expression +#### Residual variance from a (regularized) negative binomial model -A completely different approach to feature selection is to use gene-gene correlations. This method -is based on the idea that multiple genes will be differentially expressed between different cell-types -or cell-states. 
Genes which are expressed in the same cell-population will be positively correlated
-with each other where as genes expressed in different cell-populations will be negatively correated with
-each other. Thus important genes can be identified by the magnitude of their correlation
-with other genes.
+In the [normalization chapter](#normalization-theory) we introduced the
+`sctransform` approach to using Pearson residuals from a regularized negative
+binomial generalized linear model to normalize scRNA-seq data.
-The limitation of this method is that it assumes technical noise is random and independent for each cell,
-thus shouldn't produce gene-gene correlations, but this assumption is violated by batch effects which are
-generally systematic between different experimental batches and will produce gene-gene correlations. As a
-result it is more appropriate to take the top few thousand genes as ranked by gene-gene correlation than
-consider the significance of the correlations.
+The residual variance of genes (i.e. the variance of the Pearson residuals)
+provides a way to identify highly variable genes, where the "variance" is
+decoupled from the average level of expression of the gene.
+The residual variance is easily accessible from the `sctransform` output as we
+show below.
+
+First, we run `sctransform` as we did previously. 
```r -cor_feat <- M3Drop::corFS(expr_matrix) -Cor_genes <- names(cor_feat)[1:1500] +deng_sparse <- as(counts(deng), "dgCMatrix") +### Genes expressed in at least 5 cells will be kept +sctnorm_data <- sctransform::vst(umi = deng_sparse, min_cells = 1, + cell_attr = as.data.frame(colData(deng)), + latent_var = "log10_total_counts_endogenous") +``` + +``` +## | | | 0% | |======== | 12% | |================ | 25% | |======================== | 38% | |================================ | 50% | |========================================= | 62% | |================================================= | 75% | |========================================================= | 88% | |=================================================================| 100% +## | | | 0% | |= | 1% | |== | 2% | |== | 4% | |=== | 5% | |==== | 6% | |===== | 7% | |===== | 8% | |====== | 10% | |======= | 11% | |======== | 12% | |========= | 13% | |========= | 14% | |========== | 15% | |=========== | 17% | |============ | 18% | |============ | 19% | |============= | 20% | |============== | 21% | |=============== | 23% | |=============== | 24% | |================ | 25% | |================= | 26% | |================== | 27% | |=================== | 29% | |=================== | 30% | |==================== | 31% | |===================== | 32% | |====================== | 33% | |====================== | 35% | |======================= | 36% | |======================== | 37% | |========================= | 38% | |========================== | 39% | |========================== | 40% | |=========================== | 42% | |============================ | 43% | |============================= | 44% | |============================= | 45% | |============================== | 46% | |=============================== | 48% | |================================ | 49% | |================================ | 50% | |================================= | 51% | |================================== | 52% | |=================================== | 54% | 
|==================================== | 55% | |==================================== | 56% | |===================================== | 57% | |====================================== | 58% | |======================================= | 60% | |======================================= | 61% | |======================================== | 62% | |========================================= | 63% | |========================================== | 64% | |=========================================== | 65% | |=========================================== | 67% | |============================================ | 68% | |============================================= | 69% | |============================================== | 70% | |============================================== | 71% | |=============================================== | 73% | |================================================ | 74% | |================================================= | 75% | |================================================== | 76% | |================================================== | 77% | |=================================================== | 79% | |==================================================== | 80% | |===================================================== | 81% | |===================================================== | 82% | |====================================================== | 83% | |======================================================= | 85% | |======================================================== | 86% | |======================================================== | 87% | |========================================================= | 88% | |========================================================== | 89% | |=========================================================== | 90% | |============================================================ | 92% | |============================================================ | 93% | |============================================================= | 94% | 
|============================================================== | 95% | |=============================================================== | 96% | |=============================================================== | 98% | |================================================================ | 99% | |=================================================================| 100% +``` + +```r +sctnorm_data$model_str +``` + +``` +## [1] "y ~ log10_total_counts_endogenous" ``` -Lastly, another common method for feature selection in scRNASeq data is to use PCA loadings. Genes with -high PCA loadings are likely to be highly variable and correlated with many other variable genes, thus -may be relevant to the underlying biology. However, as with gene-gene correlations PCA loadings tend to -be susceptible to detecting systematic variation due to batch effects; thus it is recommended to plot the PCA -results to determine those components corresponding to the biological variation rather than batch effects. ```r -# PCA is typically performed on log-transformed expression data -pca <- prcomp(log(expr_matrix + 1) / log(2)) +library(ggplot2) +ggplot(sctnorm_data$gene_attr, aes(residual_variance)) + + geom_histogram(binwidth=0.1) + + geom_vline(xintercept=1, color='red') + xlim(0, 10) +``` -# plot projection -plot( - pca$rotation[,1], - pca$rotation[,2], - pch = 16, - col = cell_colors[as.factor(celltype_labs)] -) -# calculate loadings for components 1 and 2 -score <- rowSums(abs(pca$x[,c(1,2)])) -names(score) <- rownames(expr_matrix) -score <- score[order(-score)] -PCA_genes <- names(score[1:1500]) +<img src="feature-selection_files/figure-html/sctransform-feature-select-1.png" width="90%" style="display: block; margin: auto;" /> + +```r +sctnorm_data$gene_attr$label <- rownames(sctnorm_data$gene_attr) +ggplot(sctnorm_data$gene_attr, aes(x = gmean, y=residual_variance)) + + geom_point(alpha = 0.6) + + geom_point(colour = "firebrick2", + data = 
sctnorm_data$gene_attr[sctnorm_data$gene_attr$residual_variance > 3,]) +
+  scale_x_log10() +
+  geom_hline(yintercept = 1, size = 3, color = "dodgerblue") +
+  geom_label(aes(label = label),
+             data = sctnorm_data$gene_attr[sctnorm_data$gene_attr$residual_variance > 30,]) +
+  theme_bw()
```
-__Exercise 6__
-Consider the top 5 principal components. Which appear to be most biologically relevant? How does the top 1,500
-features change if you consider the loadings for those components?
+
+<img src="feature-selection_files/figure-html/sctransform-feature-select-2.png" width="90%" style="display: block; margin: auto;" />
+
+```r
+sct_genes <- rownames(sctnorm_data$gene_attr)[sctnorm_data$gene_attr$residual_variance > 4]
+table(sctnorm_data$gene_attr$residual_variance > 4)
+```
+
+```
+##
+## FALSE TRUE
+## 20077 1220
+```
+
+If we set a (relatively arbitrary) threshold of a residual variance greater than
+four marking a "highly variable gene", then we identify around 1200 highly
+variable genes with this `sctransform` approach.
+
+
+[NB: the `deng` data is extremely high depth for scRNA-seq data, so not the most
+applicable dataset for `sctransform`, but we include this analysis here to
+demonstrate the method rather than make any evaluation of its performance in
+general.]
+
+
+Although not explored here, the _deviance_ statistic from the regularized NB GLM
+fit provides a natural way to select informative features for downstream
+analyses.
+
+The [deviance](https://en.wikipedia.org/wiki/Deviance_(statistics)) is a
+goodness-of-fit statistic for a statistical model. As Wikipedia notes, deviance
+is a generalization of the idea of using the sum of squares of residuals in
+ordinary least squares to cases where model-fitting is achieved by maximum
+likelihood. It plays an important role in exponential dispersion models and
+generalized linear models, such as the negative binomial model. 
+
+However, `sctransform` does not seem set up to use the model deviance to select
+informative features, but we expect this could be a direction the field goes in
+the near future. Keep an eye out!
+
+
+### Correlated Expression
+
+A completely different approach to feature selection is to use gene-gene
+correlations. This method is based on the idea that multiple genes will be
+differentially expressed between different cell-types or cell-states. Genes
+which are expressed in the same cell-population will be positively correlated
+with each other whereas genes expressed in different cell-populations will be
+negatively correlated with each other. Thus important genes can be identified by
+the magnitude of their correlation with other genes.
+
+The limitation of this method is that it assumes technical noise is random and
+independent for each cell, thus shouldn't produce gene-gene correlations, but
+this assumption is violated by batch effects which are generally systematic
+between different experimental batches and will produce gene-gene correlations.
+As a result it is more appropriate to take the top few thousand genes as ranked
+by gene-gene correlation than consider the significance of the correlations.
+
+
+
+```r
+cor_feat <- M3Drop::corFS(expr_matrix)
+Cor_genes <- names(cor_feat)[1:1500]
+```
+
 ### Comparing Methods
 
-We can check whether the identified features really do represent genes differentially expressed between
-cell-types in this dataset.
+We can check whether the identified features really do represent genes
+differentially expressed between cell-types in this dataset.
 
```r @@ -309,15 +516,32 @@ M3DropExpressionHeatmap( ) ``` +<img src="feature-selection_files/figure-html/unnamed-chunk-16-1.png" width="90%" style="display: block; margin: auto;" /> + We can also consider how consistent each feature selection method is with the others using the Jaccard Index: + ```r J <- sum(M3Drop_genes %in% HVG_genes)/length(unique(c(M3Drop_genes, HVG_genes))) ``` -__Exercise 7__ +__Exercise 6__ + +Plot the expression of the features for each of the other methods. Which appear +to be differentially expressed? How consistent are the different methods for +this dataset? + + +```r +M3DropExpressionHeatmap( + DANB_genes, + expr_matrix, + cell_labels = celltype_labs +) +``` + +<img src="feature-selection_files/figure-html/unnamed-chunk-18-1.png" width="90%" style="display: block; margin: auto;" /> -Plot the expression of the features for each of the other methods. Which appear to be differentially expressed? How consistent are the different methods for this dataset? @@ -327,9 +551,134 @@ Plot the expression of the features for each of the other methods. 
Which appear +Jaccard index comparison of sets of informative features: +```r +list_of_features <- list( + M3Drop_genes, + DANB_genes, + HVG_genes, + simplesinglecell_genes, + sct_genes +) +Out <- matrix( + 0, + ncol = length(list_of_features), + nrow = length(list_of_features) +) +for(i in 1:length(list_of_features) ) { + for(j in 1:length(list_of_features) ) { + Out[i,j] <- sum(list_of_features[[i]] %in% list_of_features[[j]])/ + length(unique(c(list_of_features[[i]], list_of_features[[j]]))) + } +} +colnames(Out) <- rownames(Out) <- c("M3Drop", "DANB", "Brennecke", "simpleSingleCell", "sctransform") +Out +``` + +``` +## M3Drop DANB Brennecke simpleSingleCell +## M3Drop 1.0000000 0.38019061 0.4152905 0.14615908 +## DANB 0.3801906 1.00000000 0.2283346 0.09868187 +## Brennecke 0.4152905 0.22833459 1.0000000 0.15019157 +## simpleSingleCell 0.1461591 0.09868187 0.1501916 1.00000000 +## sctransform 0.2343257 0.21801471 0.2718985 0.26034913 +## sctransform +## M3Drop 0.2343257 +## DANB 0.2180147 +## Brennecke 0.2718985 +## simpleSingleCell 0.2603491 +## sctransform 1.0000000 +``` + ### sessionInfo() +``` +## R version 3.6.0 (2019-04-26) +## Platform: x86_64-pc-linux-gnu (64-bit) +## Running under: Ubuntu 18.04.3 LTS +## +## Matrix products: default +## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 +## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 +## +## locale: +## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C +## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 +## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 +## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C +## [9] LC_ADDRESS=C LC_TELEPHONE=C +## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C +## +## attached base packages: +## [1] parallel stats4 stats graphics grDevices utils datasets +## [8] methods base +## +## other attached packages: +## [1] scran_1.12.1 scater_1.12.2 +## [3] ggplot2_3.2.1 Polychrome_1.2.3 +## [5] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 +## [7] DelayedArray_0.10.0 
BiocParallel_1.18.1 +## [9] Biobase_2.44.0 GenomicRanges_1.36.1 +## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 +## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 +## [15] RColorBrewer_1.1-2 M3Drop_1.10.0 +## [17] numDeriv_2016.8-1.1 matrixStats_0.55.0 +## [19] scRNA.seq.funcs_0.1.0 +## +## loaded via a namespace (and not attached): +## [1] Rtsne_0.15 ggbeeswarm_0.6.0 +## [3] colorspace_1.4-1 dynamicTreeCut_1.63-1 +## [5] htmlTable_1.13.2 XVector_0.24.0 +## [7] base64enc_0.1-3 BiocNeighbors_1.2.0 +## [9] rstudioapi_0.10 listenv_0.7.0 +## [11] codetools_0.2-16 splines_3.6.0 +## [13] knitr_1.25 Formula_1.2-3 +## [15] cluster_2.1.0 sctransform_0.2.0 +## [17] compiler_3.6.0 dqrng_0.2.1 +## [19] backports_1.1.4 assertthat_0.2.1 +## [21] Matrix_1.2-17 lazyeval_0.2.2 +## [23] limma_3.40.6 BiocSingular_1.0.0 +## [25] acepack_1.4.1 htmltools_0.3.6 +## [27] tools_3.6.0 rsvd_1.0.2 +## [29] igraph_1.2.4.1 gtable_0.3.0 +## [31] glue_1.3.1 GenomeInfoDbData_1.2.1 +## [33] reshape2_1.4.3 dplyr_0.8.3 +## [35] Rcpp_1.0.2 bbmle_1.0.20 +## [37] gdata_2.18.0 nlme_3.1-139 +## [39] DelayedMatrixStats_1.6.1 xfun_0.9 +## [41] stringr_1.4.0 globals_0.12.4 +## [43] irlba_2.3.3 gtools_3.8.1 +## [45] hypergeo_1.2-13 statmod_1.4.32 +## [47] future_1.14.0 edgeR_3.26.8 +## [49] zlibbioc_1.30.0 MASS_7.3-51.1 +## [51] scales_1.0.0 yaml_2.2.0 +## [53] gridExtra_2.3 rpart_4.1-15 +## [55] latticeExtra_0.6-28 stringi_1.4.3 +## [57] checkmate_1.9.4 orthopolynom_1.0-5 +## [59] contfrac_1.1-12 caTools_1.17.1.2 +## [61] rlang_0.4.0 pkgconfig_2.0.3 +## [63] moments_0.14 bitops_1.0-6 +## [65] evaluate_0.14 lattice_0.20-38 +## [67] purrr_0.3.2 htmlwidgets_1.3 +## [69] labeling_0.3 cowplot_1.0.0 +## [71] tidyselect_0.2.5 deSolve_1.24 +## [73] plyr_1.8.4 magrittr_1.5 +## [75] bookdown_0.13 R6_2.4.0 +## [77] gplots_3.0.1.1 Hmisc_4.2-0 +## [79] pillar_1.4.2 foreign_0.8-70 +## [81] withr_2.1.2 mgcv_1.8-28 +## [83] survival_2.43-3 scatterplot3d_0.3-41 +## [85] RCurl_1.95-4.12 nnet_7.3-12 +## [87] future.apply_1.3.0 
tibble_2.1.3 +## [89] crayon_1.3.4 KernSmooth_2.23-15 +## [91] rmarkdown_1.15 viridis_0.5.1 +## [93] locfit_1.5-9.1 grid_3.6.0 +## [95] data.table_1.12.2 reldist_1.6-6 +## [97] digest_0.6.21 elliptic_1.4-0 +## [99] munsell_0.5.0 beeswarm_0.2.3 +## [101] viridisLite_0.3.0 vipor_0.4.5 +``` diff --git a/public/feature-selection_files/figure-html/hvg-simpleSingleCell-1.png b/public/feature-selection_files/figure-html/hvg-simpleSingleCell-1.png new file mode 100644 index 0000000000000000000000000000000000000000..be189e52217b49e9d15254366d6a8290081ba788 Binary files /dev/null and b/public/feature-selection_files/figure-html/hvg-simpleSingleCell-1.png differ diff --git a/public/feature-selection_files/figure-html/hvg-simpleSingleCell-2.png b/public/feature-selection_files/figure-html/hvg-simpleSingleCell-2.png new file mode 100644 index 0000000000000000000000000000000000000000..9262a07cfc891aa5b4432853af45383eb43e87c9 Binary files /dev/null and b/public/feature-selection_files/figure-html/hvg-simpleSingleCell-2.png differ diff --git a/public/feature-selection_files/figure-html/sctransform-feature-select-1.png b/public/feature-selection_files/figure-html/sctransform-feature-select-1.png new file mode 100644 index 0000000000000000000000000000000000000000..030a8ee4981994c89499c157c2d4e74ea02c50ed Binary files /dev/null and b/public/feature-selection_files/figure-html/sctransform-feature-select-1.png differ diff --git a/public/feature-selection_files/figure-html/sctransform-feature-select-2.png b/public/feature-selection_files/figure-html/sctransform-feature-select-2.png new file mode 100644 index 0000000000000000000000000000000000000000..4a157fe0fba2c645f86b86ae131de437f88123df Binary files /dev/null and b/public/feature-selection_files/figure-html/sctransform-feature-select-2.png differ diff --git a/public/feature-selection_files/figure-html/unnamed-chunk-10-1.png b/public/feature-selection_files/figure-html/unnamed-chunk-10-1.png new file mode 100644 index 
0000000000000000000000000000000000000000..c0b9c6331a9bf92574b50fc3b00ab86ded2ec46c Binary files /dev/null and b/public/feature-selection_files/figure-html/unnamed-chunk-10-1.png differ diff --git a/public/feature-selection_files/figure-html/unnamed-chunk-12-1.png b/public/feature-selection_files/figure-html/unnamed-chunk-12-1.png new file mode 100644 index 0000000000000000000000000000000000000000..427efbcbedf0d153b9dcfe99774a59b6e2bd5f66 Binary files /dev/null and b/public/feature-selection_files/figure-html/unnamed-chunk-12-1.png differ diff --git a/public/feature-selection_files/figure-html/unnamed-chunk-13-1.png b/public/feature-selection_files/figure-html/unnamed-chunk-13-1.png new file mode 100644 index 0000000000000000000000000000000000000000..22f7236bf61c708743f11cf0eb95c2386b8a5370 Binary files /dev/null and b/public/feature-selection_files/figure-html/unnamed-chunk-13-1.png differ diff --git a/public/feature-selection_files/figure-html/unnamed-chunk-16-1.png b/public/feature-selection_files/figure-html/unnamed-chunk-16-1.png new file mode 100644 index 0000000000000000000000000000000000000000..ae725c711aabf2823fefc1f9c075d3cf90ff0e06 Binary files /dev/null and b/public/feature-selection_files/figure-html/unnamed-chunk-16-1.png differ diff --git a/public/feature-selection_files/figure-html/unnamed-chunk-18-1.png b/public/feature-selection_files/figure-html/unnamed-chunk-18-1.png new file mode 100644 index 0000000000000000000000000000000000000000..30e1082d60ba9488c2994cb9ed6f6568c7891a21 Binary files /dev/null and b/public/feature-selection_files/figure-html/unnamed-chunk-18-1.png differ diff --git a/public/feature-selection_files/figure-html/unnamed-chunk-6-1.png b/public/feature-selection_files/figure-html/unnamed-chunk-6-1.png new file mode 100644 index 0000000000000000000000000000000000000000..4f9ea294b573253a32e655f7d3d0fca174daebc0 Binary files /dev/null and b/public/feature-selection_files/figure-html/unnamed-chunk-6-1.png differ diff --git 
a/public/feature-selection_files/figure-html/unnamed-chunk-7-1.png b/public/feature-selection_files/figure-html/unnamed-chunk-7-1.png new file mode 100644 index 0000000000000000000000000000000000000000..03b67c3f9e82d14d438c04f4c9e4e1a9d6a2486d Binary files /dev/null and b/public/feature-selection_files/figure-html/unnamed-chunk-7-1.png differ diff --git a/public/figures/FA.png b/public/figures/FA.png index ebbfebffbc374e73c134c2d70734b7a3ddb4970b..5e17a7274d3236635e3d34e780dce290b25c49a0 100644 Binary files a/public/figures/FA.png and b/public/figures/FA.png differ diff --git a/public/figures/FA_matrix.png b/public/figures/FA_matrix.png new file mode 100644 index 0000000000000000000000000000000000000000..c2d2b68fcf2024990556f5cf864772bbef309eea Binary files /dev/null and b/public/figures/FA_matrix.png differ diff --git a/public/figures/phate.png b/public/figures/phate.png new file mode 100644 index 0000000000000000000000000000000000000000..cf6d65ae2c1d8ca0a6df131bf3ab3f727ac16b8a Binary files /dev/null and b/public/figures/phate.png differ diff --git a/public/figures/slab_spike.png b/public/figures/slab_spike.png new file mode 100644 index 0000000000000000000000000000000000000000..d88d5916719cbcf546e3a1828f7f22513ede3d22 Binary files /dev/null and b/public/figures/slab_spike.png differ diff --git a/public/figures/slalom_anno.png b/public/figures/slalom_anno.png new file mode 100644 index 0000000000000000000000000000000000000000..77a37b361609bab3e0a76603f7c9f61611f4d34d Binary files /dev/null and b/public/figures/slalom_anno.png differ diff --git a/public/handling-sparsity.html b/public/handling-sparsity.html index daf886ed4f2f9931c1fb723421a5772a00c8b0bd..62aeaab025018804589ac4b58f1e08632f0502eb 100644 --- a/public/handling-sparsity.html +++ b/public/handling-sparsity.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" 
data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i 
class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor 
Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa 
fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li 
class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -644,7 +644,7 @@ Ideally, such a benchmarking platform would remain dynamic beyond an initial pub Detailed benchmarking would also help to establish when normalization methods derived from explicit count models may be preferable to imputation.</p> <p>Finally, scalability for large numbers of cells remains an ongoing concern for imputation, data smoothing and data reconstruction methods, as for all high-throughput single-cell methods and software (see ).</p> -<div class="sourceCode" id="cb553"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb553-1" data-line-number="1"><span class="kw">library</span>(scater)</a></code></pre></div> +<div class="sourceCode" id="cb621"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb621-1" data-line-number="1"><span class="kw">library</span>(scater)</a></code></pre></div> <pre><code>## Loading required package: SingleCellExperiment</code></pre> <pre><code>## Loading 
required package: SummarizedExperiment</code></pre> <pre><code>## Loading required package: GenomicRanges</code></pre> @@ -709,10 +709,11 @@ Detailed benchmarking would also help to establish when normalization methods de <pre><code>## The following object is masked from 'package:stats': ## ## filter</code></pre> -<div class="sourceCode" id="cb583"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb583-1" data-line-number="1"><span class="kw">library</span>(SingleCellExperiment)</a> -<a class="sourceLine" id="cb583-2" data-line-number="2"><span class="kw">library</span>(glmpca)</a> -<a class="sourceLine" id="cb583-3" data-line-number="3"><span class="kw">library</span>(ggplot2)</a> -<a class="sourceLine" id="cb583-4" data-line-number="4"><span class="kw">library</span>(Polychrome)</a></code></pre></div> +<div class="sourceCode" id="cb651"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb651-1" data-line-number="1"><span class="kw">library</span>(SingleCellExperiment)</a> +<a class="sourceLine" id="cb651-2" data-line-number="2"><span class="kw">library</span>(glmpca)</a> +<a class="sourceLine" id="cb651-3" data-line-number="3"><span class="kw">library</span>(ggplot2)</a> +<a class="sourceLine" id="cb651-4" data-line-number="4"><span class="kw">library</span>(Polychrome)</a> +<a class="sourceLine" id="cb651-5" data-line-number="5"><span class="kw">library</span>(slalom)</a></code></pre></div> </div> </div> <h3> References</h3> diff --git a/public/ideal-scrnaseq-pipeline-as-of-oct-2019.html b/public/ideal-scrnaseq-pipeline-as-of-oct-2019.html index a59ab54a0119c7a774f14c220eaa2f4c196aac8f..28e1dc3b6157c9d70c0eb0c926d7468a57ca8939 100644 --- a/public/ideal-scrnaseq-pipeline-as-of-oct-2019.html +++ b/public/ideal-scrnaseq-pipeline-as-of-oct-2019.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" 
data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i 
class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor 
Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa 
fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li 
class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -538,7 +538,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li>Cell clustering & cell-type identification benefits from large number of cells and doesn’t requireas high sequencing depth (~100,000 reads per cell).</li> </ul></li> </ul> -<div class="sourceCode" id="cb820"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb820-1" data-line-number="1">knitr<span class="op">::</span><span class="kw">include_graphics</span>(<span class="st">"figures/Pipeline-batches.png"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb903"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb903-1" data-line-number="1">knitr<span class="op">::</span><span class="kw">include_graphics</span>(<span class="st">"figures/Pipeline-batches.png"</span>)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:pipeline-batches"></span> <img 
src="figures/Pipeline-batches.png" alt="Appropriate approaches to batch effects in scRNASeq. Red arrows indicate batch effects which are (pale) or are not (vibrant) correctable through batch-correction." width="90%" /> <p class="caption"> diff --git a/public/imputation.html b/public/imputation.html index 1aa9ab6e9efcd6be5ac322bc9876986d7093cb3b..ad2ed2c5ddbfd48b0c77d090f45e8d351183376b 100644 --- a/public/imputation.html +++ b/public/imputation.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa 
fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a 
href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" 
data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa 
fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -506,13 +506,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <section class="normal" id="section-"> <div id="imputation" class="section level1"> <h1><span class="header-section-number">13</span> Imputation</h1> -<div class="sourceCode" id="cb726"><pre class="sourceCode r"><code class="sourceCode r"><a 
class="sourceLine" id="cb726-1" data-line-number="1"><span class="kw">library</span>(scImpute)</a> -<a class="sourceLine" id="cb726-2" data-line-number="2"><span class="kw">library</span>(SC3)</a> -<a class="sourceLine" id="cb726-3" data-line-number="3"><span class="kw">library</span>(scater)</a> -<a class="sourceLine" id="cb726-4" data-line-number="4"><span class="kw">library</span>(SingleCellExperiment)</a> -<a class="sourceLine" id="cb726-5" data-line-number="5"><span class="kw">library</span>(mclust)</a> -<a class="sourceLine" id="cb726-6" data-line-number="6"><span class="kw">library</span>(DrImpute)</a> -<a class="sourceLine" id="cb726-7" data-line-number="7"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a></code></pre></div> +<div class="sourceCode" id="cb809"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb809-1" data-line-number="1"><span class="kw">library</span>(scImpute)</a> +<a class="sourceLine" id="cb809-2" data-line-number="2"><span class="kw">library</span>(SC3)</a> +<a class="sourceLine" id="cb809-3" data-line-number="3"><span class="kw">library</span>(scater)</a> +<a class="sourceLine" id="cb809-4" data-line-number="4"><span class="kw">library</span>(SingleCellExperiment)</a> +<a class="sourceLine" id="cb809-5" data-line-number="5"><span class="kw">library</span>(mclust)</a> +<a class="sourceLine" id="cb809-6" data-line-number="6"><span class="kw">library</span>(DrImpute)</a> +<a class="sourceLine" id="cb809-7" data-line-number="7"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a></code></pre></div> <p>As discussed previously, one of the main challenges when analyzing scRNA-seq data is the presence of zeros, or dropouts. 
The dropouts are assumed to have arisen for three possible reasons:</p> @@ -541,61 +541,61 @@ imputed all inflated zeros.</p> <div id="scimpute" class="section level3"> <h3><span class="header-section-number">13.0.1</span> scImpute</h3> <p>To test <code>scImpute</code>, we use the default parameters and we apply it to the Deng dataset that we have worked with before. scImpute takes a .csv or .txt file as an input:</p> -<div class="sourceCode" id="cb727"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb727-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/deng/deng-reads.rds"</span>)</a> -<a class="sourceLine" id="cb727-2" data-line-number="2"><span class="kw">write.csv</span>(<span class="kw">counts</span>(deng), <span class="st">"deng.csv"</span>)</a> -<a class="sourceLine" id="cb727-3" data-line-number="3"><span class="kw">scimpute</span>(</a> -<a class="sourceLine" id="cb727-4" data-line-number="4"> <span class="dt">count_path =</span> <span class="st">"deng.csv"</span>,</a> -<a class="sourceLine" id="cb727-5" data-line-number="5"> <span class="dt">infile =</span> <span class="st">"csv"</span>,</a> -<a class="sourceLine" id="cb727-6" data-line-number="6"> <span class="dt">outfile =</span> <span class="st">"txt"</span>, </a> -<a class="sourceLine" id="cb727-7" data-line-number="7"> <span class="dt">out_dir =</span> <span class="st">"./"</span>,</a> -<a class="sourceLine" id="cb727-8" data-line-number="8"> <span class="dt">Kcluster =</span> <span class="dv">10</span>,</a> -<a class="sourceLine" id="cb727-9" data-line-number="9"> <span class="dt">ncores =</span> <span class="dv">2</span></a> -<a class="sourceLine" id="cb727-10" data-line-number="10">)</a></code></pre></div> +<div class="sourceCode" id="cb810"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb810-1" data-line-number="1">deng <-<span class="st"> </span><span 
class="kw">readRDS</span>(<span class="st">"data/deng/deng-reads.rds"</span>)</a> +<a class="sourceLine" id="cb810-2" data-line-number="2"><span class="kw">write.csv</span>(<span class="kw">counts</span>(deng), <span class="st">"deng.csv"</span>)</a> +<a class="sourceLine" id="cb810-3" data-line-number="3"><span class="kw">scimpute</span>(</a> +<a class="sourceLine" id="cb810-4" data-line-number="4"> <span class="dt">count_path =</span> <span class="st">"deng.csv"</span>,</a> +<a class="sourceLine" id="cb810-5" data-line-number="5"> <span class="dt">infile =</span> <span class="st">"csv"</span>,</a> +<a class="sourceLine" id="cb810-6" data-line-number="6"> <span class="dt">outfile =</span> <span class="st">"txt"</span>, </a> +<a class="sourceLine" id="cb810-7" data-line-number="7"> <span class="dt">out_dir =</span> <span class="st">"./"</span>,</a> +<a class="sourceLine" id="cb810-8" data-line-number="8"> <span class="dt">Kcluster =</span> <span class="dv">10</span>,</a> +<a class="sourceLine" id="cb810-9" data-line-number="9"> <span class="dt">ncores =</span> <span class="dv">2</span></a> +<a class="sourceLine" id="cb810-10" data-line-number="10">)</a></code></pre></div> <p>Now we can compare the results with original data by considering a PCA plot</p> -<div class="sourceCode" id="cb728"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb728-1" data-line-number="1">res <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"scimpute_count.txt"</span>)</a> -<a class="sourceLine" id="cb728-2" data-line-number="2"><span class="kw">colnames</span>(res) <-<span class="st"> </span><span class="ot">NULL</span></a> -<a class="sourceLine" id="cb728-3" data-line-number="3">res <-<span class="st"> </span><span class="kw">SingleCellExperiment</span>(</a> -<a class="sourceLine" id="cb728-4" data-line-number="4"> <span class="dt">assays =</span> <span class="kw">list</span>(<span class="dt">logcounts =</span> <span 
class="kw">log2</span>(<span class="kw">as.matrix</span>(res) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)), </a> -<a class="sourceLine" id="cb728-5" data-line-number="5"> <span class="dt">colData =</span> <span class="kw">colData</span>(deng)</a> -<a class="sourceLine" id="cb728-6" data-line-number="6">)</a> -<a class="sourceLine" id="cb728-7" data-line-number="7"><span class="kw">rowData</span>(res)<span class="op">$</span>feature_symbol <-<span class="st"> </span><span class="kw">rowData</span>(deng)<span class="op">$</span>feature_symbol</a> -<a class="sourceLine" id="cb728-8" data-line-number="8"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb728-9" data-line-number="9"> res, </a> -<a class="sourceLine" id="cb728-10" data-line-number="10"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span></a> -<a class="sourceLine" id="cb728-11" data-line-number="11">)</a></code></pre></div> +<div class="sourceCode" id="cb811"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb811-1" data-line-number="1">res <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"scimpute_count.txt"</span>)</a> +<a class="sourceLine" id="cb811-2" data-line-number="2"><span class="kw">colnames</span>(res) <-<span class="st"> </span><span class="ot">NULL</span></a> +<a class="sourceLine" id="cb811-3" data-line-number="3">res <-<span class="st"> </span><span class="kw">SingleCellExperiment</span>(</a> +<a class="sourceLine" id="cb811-4" data-line-number="4"> <span class="dt">assays =</span> <span class="kw">list</span>(<span class="dt">logcounts =</span> <span class="kw">log2</span>(<span class="kw">as.matrix</span>(res) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)), </a> +<a class="sourceLine" id="cb811-5" data-line-number="5"> <span class="dt">colData =</span> <span class="kw">colData</span>(deng)</a> +<a class="sourceLine" id="cb811-6" 
data-line-number="6">)</a> +<a class="sourceLine" id="cb811-7" data-line-number="7"><span class="kw">rowData</span>(res)<span class="op">$</span>feature_symbol <-<span class="st"> </span><span class="kw">rowData</span>(deng)<span class="op">$</span>feature_symbol</a> +<a class="sourceLine" id="cb811-8" data-line-number="8"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb811-9" data-line-number="9"> res, </a> +<a class="sourceLine" id="cb811-10" data-line-number="10"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span></a> +<a class="sourceLine" id="cb811-11" data-line-number="11">)</a></code></pre></div> <p>Compare this result to the original data in Chapter <a href="clustering-and-cell-annotation.html#clust-methods">10.2</a>. What are the most significant differences?</p> <p>We can examine the expression of specific genes to directly see the effect of imputation on the expression distribution.</p> -<div class="sourceCode" id="cb729"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb729-1" data-line-number="1"><span class="kw">plotExpression</span>(res, <span class="kw">c</span>(<span class="st">"Sox2"</span>, <span class="st">"Eomes"</span>, <span class="st">"Zscan4d"</span>, <span class="st">"Fgf4"</span>))</a> -<a class="sourceLine" id="cb729-2" data-line-number="2"><span class="kw">plotExpression</span>(deng, <span class="kw">c</span>(<span class="st">"Sox2"</span>, <span class="st">"Eomes"</span>, <span class="st">"Zscan4d"</span>, <span class="st">"Fgf4"</span>))</a></code></pre></div> +<div class="sourceCode" id="cb812"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb812-1" data-line-number="1"><span class="kw">plotExpression</span>(res, <span class="kw">c</span>(<span class="st">"Sox2"</span>, <span class="st">"Eomes"</span>, <span class="st">"Zscan4d"</span>, <span class="st">"Fgf4"</span>))</a> +<a class="sourceLine" id="cb812-2" 
data-line-number="2"><span class="kw">plotExpression</span>(deng, <span class="kw">c</span>(<span class="st">"Sox2"</span>, <span class="st">"Eomes"</span>, <span class="st">"Zscan4d"</span>, <span class="st">"Fgf4"</span>))</a></code></pre></div> <p>To evaluate the impact of the imputation, we use <code>SC3</code> to cluster the imputed matrix</p> -<div class="sourceCode" id="cb730"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb730-1" data-line-number="1">res <-<span class="st"> </span><span class="kw">sc3_estimate_k</span>(res)</a> -<a class="sourceLine" id="cb730-2" data-line-number="2"><span class="kw">metadata</span>(res)<span class="op">$</span>sc3<span class="op">$</span>k_estimation</a> -<a class="sourceLine" id="cb730-3" data-line-number="3">res <-<span class="st"> </span><span class="kw">sc3</span>(res, <span class="dt">ks =</span> <span class="dv">10</span>, <span class="dt">n_cores =</span> <span class="dv">1</span>, <span class="dt">gene_filter =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb730-4" data-line-number="4"><span class="kw">adjustedRandIndex</span>(<span class="kw">colData</span>(deng)<span class="op">$</span>cell_type2, <span class="kw">colData</span>(res)<span class="op">$</span>sc3_<span class="dv">10</span>_clusters)</a> -<a class="sourceLine" id="cb730-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb730-6" data-line-number="6"> res, </a> -<a class="sourceLine" id="cb730-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"sc3_10_clusters"</span></a> -<a class="sourceLine" id="cb730-8" data-line-number="8">)</a></code></pre></div> +<div class="sourceCode" id="cb813"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb813-1" data-line-number="1">res <-<span class="st"> </span><span class="kw">sc3_estimate_k</span>(res)</a> +<a class="sourceLine" id="cb813-2" 
data-line-number="2"><span class="kw">metadata</span>(res)<span class="op">$</span>sc3<span class="op">$</span>k_estimation</a> +<a class="sourceLine" id="cb813-3" data-line-number="3">res <-<span class="st"> </span><span class="kw">sc3</span>(res, <span class="dt">ks =</span> <span class="dv">10</span>, <span class="dt">n_cores =</span> <span class="dv">1</span>, <span class="dt">gene_filter =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb813-4" data-line-number="4"><span class="kw">adjustedRandIndex</span>(<span class="kw">colData</span>(deng)<span class="op">$</span>cell_type2, <span class="kw">colData</span>(res)<span class="op">$</span>sc3_<span class="dv">10</span>_clusters)</a> +<a class="sourceLine" id="cb813-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb813-6" data-line-number="6"> res, </a> +<a class="sourceLine" id="cb813-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"sc3_10_clusters"</span></a> +<a class="sourceLine" id="cb813-8" data-line-number="8">)</a></code></pre></div> <p><strong>Exercise:</strong> Based on the PCA and the clustering results, do you think that imputation using <code>scImpute</code> is a good idea for the Deng dataset?</p> </div> <div id="drimpute" class="section level3"> <h3><span class="header-section-number">13.0.2</span> DrImpute</h3> <p>We can do the same for DrImpute. DrImpute runs on a log-normalized expression matrix directly in R, we generate this matrix using scater, then run DrImpute. 
Unlike scImpute, DrImpute considers the consensus imputation across a range of ks using two differ correlation distances:</p> -<div class="sourceCode" id="cb731"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb731-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">normalize</span>(deng)</a> -<a class="sourceLine" id="cb731-2" data-line-number="2">res <-<span class="st"> </span><span class="kw">DrImpute</span>(deng<span class="op">@</span>assays[[<span class="st">"logcounts"</span>]], <span class="dt">ks=</span><span class="dv">8</span><span class="op">:</span><span class="dv">12</span>)</a> -<a class="sourceLine" id="cb731-3" data-line-number="3"><span class="kw">colnames</span>(res) <-<span class="st"> </span><span class="kw">colnames</span>(deng)</a> -<a class="sourceLine" id="cb731-4" data-line-number="4"><span class="kw">rownames</span>(res) <-<span class="st"> </span><span class="kw">rownames</span>(deng)</a> -<a class="sourceLine" id="cb731-5" data-line-number="5">res <-<span class="st"> </span><span class="kw">SingleCellExperiment</span>(</a> -<a class="sourceLine" id="cb731-6" data-line-number="6"> <span class="dt">assays =</span> <span class="kw">list</span>(<span class="dt">logcounts =</span> <span class="kw">as.matrix</span>(res)), </a> -<a class="sourceLine" id="cb731-7" data-line-number="7"> <span class="dt">colData =</span> <span class="kw">colData</span>(deng)</a> -<a class="sourceLine" id="cb731-8" data-line-number="8">)</a> -<a class="sourceLine" id="cb731-9" data-line-number="9"><span class="kw">rowData</span>(res)<span class="op">$</span>feature_symbol <-<span class="st"> </span><span class="kw">rowData</span>(deng)<span class="op">$</span>feature_symbol</a> -<a class="sourceLine" id="cb731-10" data-line-number="10"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb731-11" data-line-number="11"> res, </a> -<a class="sourceLine" id="cb731-12" data-line-number="12"> <span 
class="dt">colour_by =</span> <span class="st">"cell_type2"</span></a> -<a class="sourceLine" id="cb731-13" data-line-number="13">)</a> -<a class="sourceLine" id="cb731-14" data-line-number="14"><span class="kw">plotExpression</span>(res, <span class="kw">c</span>(<span class="st">"Sox2"</span>, <span class="st">"Eomes"</span>, <span class="st">"Zscan4d"</span>, <span class="st">"Fgf4"</span>))</a></code></pre></div> +<div class="sourceCode" id="cb814"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb814-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">normalize</span>(deng)</a> +<a class="sourceLine" id="cb814-2" data-line-number="2">res <-<span class="st"> </span><span class="kw">DrImpute</span>(deng<span class="op">@</span>assays[[<span class="st">"logcounts"</span>]], <span class="dt">ks=</span><span class="dv">8</span><span class="op">:</span><span class="dv">12</span>)</a> +<a class="sourceLine" id="cb814-3" data-line-number="3"><span class="kw">colnames</span>(res) <-<span class="st"> </span><span class="kw">colnames</span>(deng)</a> +<a class="sourceLine" id="cb814-4" data-line-number="4"><span class="kw">rownames</span>(res) <-<span class="st"> </span><span class="kw">rownames</span>(deng)</a> +<a class="sourceLine" id="cb814-5" data-line-number="5">res <-<span class="st"> </span><span class="kw">SingleCellExperiment</span>(</a> +<a class="sourceLine" id="cb814-6" data-line-number="6"> <span class="dt">assays =</span> <span class="kw">list</span>(<span class="dt">logcounts =</span> <span class="kw">as.matrix</span>(res)), </a> +<a class="sourceLine" id="cb814-7" data-line-number="7"> <span class="dt">colData =</span> <span class="kw">colData</span>(deng)</a> +<a class="sourceLine" id="cb814-8" data-line-number="8">)</a> +<a class="sourceLine" id="cb814-9" data-line-number="9"><span class="kw">rowData</span>(res)<span class="op">$</span>feature_symbol <-<span class="st"> </span><span 
class="kw">rowData</span>(deng)<span class="op">$</span>feature_symbol</a> +<a class="sourceLine" id="cb814-10" data-line-number="10"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb814-11" data-line-number="11"> res, </a> +<a class="sourceLine" id="cb814-12" data-line-number="12"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span></a> +<a class="sourceLine" id="cb814-13" data-line-number="13">)</a> +<a class="sourceLine" id="cb814-14" data-line-number="14"><span class="kw">plotExpression</span>(res, <span class="kw">c</span>(<span class="st">"Sox2"</span>, <span class="st">"Eomes"</span>, <span class="st">"Zscan4d"</span>, <span class="st">"Fgf4"</span>))</a></code></pre></div> <p><strong>Exercise:</strong> Check the sc3 clustering of the DrImpute matrix, do you think that imputation using <code>DrImpute</code> is a good idea for the Deng dataset?</p> <p><strong>Exercise:</strong> What is the difference between <code>scImpute</code> and <code>DrImpute</code> based on the PCA and clustering analysis? 
Which one do you think is best to use?</p> </div> diff --git a/public/index.html b/public/index.html index b50d77f080d90047f6902845ebddb4de65870be6..07fb14f5fe8a44551ca2980542038ffce7c03fc4 100644 --- a/public/index.html +++ b/public/index.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa 
fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa 
fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li 
class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -595,7 +595,7 @@ course docker image which contains all the required packages.</p> instructions</a>. 
To run the course docker image (use <a href="https://hub.docker.com/r/svibiocellgen/mig_2019_scrnaseq-workshop?tab=tags">the latest version</a>):</p> -<pre><code>docker run -p 8888:8888 -e PASSWORD="jupyter" svibiocellgen/mig_2019_scrnaseq-workshop:latest</code></pre> +<pre><code>docker run -p 8888:8888 -e PASSWORD="jupyter" svibiocellgen/mig_2019_scrnaseq-workshop:v1.01</code></pre> <p>Then follow the instructions provided, e.g.:</p> <pre><code>To access the notebook, open this file in a browser: file:///home/jovyan/.local/share/jupyter/runtime/nbserver-6-open.html diff --git a/public/index.md b/public/index.md index c36179c702ab2f3853e76e41714b446d83517577..870be57068d583af6b2df3ca2f5e9bd5d1d52df3 100644 --- a/public/index.md +++ b/public/index.md @@ -109,7 +109,7 @@ docker image (use [the latest version](https://hub.docker.com/r/svibiocellgen/mig_2019_scrnaseq-workshop?tab=tags)): ``` -docker run -p 8888:8888 -e PASSWORD="jupyter" svibiocellgen/mig_2019_scrnaseq-workshop:latest +docker run -p 8888:8888 -e PASSWORD="jupyter" svibiocellgen/mig_2019_scrnaseq-workshop:v1.01 ``` Then follow the instructions provided, e.g.: diff --git a/public/integrating-single-cell-omics-datasets.html b/public/integrating-single-cell-omics-datasets.html index 38d98562c186b3661091bbe23e644ccce0d4eb14..fc78e01b094f34b3148c6c1bddf370f16063ff57 100644 --- a/public/integrating-single-cell-omics-datasets.html +++ b/public/integrating-single-cell-omics-datasets.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" 
data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a 
href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ 
-399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa 
fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -507,7 +507,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <div id="integrating-single-cell-omics-datasets" class="section level1"> <h1><span class="header-section-number">15</span> Integrating single-cell ’omics datasets</h1> -<div class="sourceCode" id="cb783"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb783-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a></code></pre></div> +<div class="sourceCode" id="cb866"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb866-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a></code></pre></div> </div> </section> diff --git a/public/intro-to-R.md b/public/intro-to-R.md index 2080ffb4fa782d84265f61b35ae04a577d817f73..59aa06ccf016c20b9aec186e387c21512015f648 100644 --- a/public/intro-to-R.md +++ b/public/intro-to-R.md @@ -562,7 +562,7 @@ ll ## $even_a_function ## function (..., deparse.level = 1) ## .Internal(cbind(deparse.level, ...)) -## <bytecode: 0x55bacbd9d118> +## <bytecode: 0x5600bf7f70f8> ## <environment: namespace:base> ``` diff --git a/public/introduction-to-rbioconductor.html 
b/public/introduction-to-rbioconductor.html index 89f40ef74188fc3a669f5f59c292a18b5111aa0c..b930fde493ff8ec0ac6a0a8e16e7f8731b71abac 100644 --- a/public/introduction-to-rbioconductor.html +++ b/public/introduction-to-rbioconductor.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa 
fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa 
fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li 
class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -749,7 +749,7 @@ If we combine a character vector and a numeric vector into a matrix, all the dat ## $even_a_function ## function (..., deparse.level = 1) ## .Internal(cbind(deparse.level, ...)) -## <bytecode: 0x55bacbd9d118> +## <bytecode: 0x5600bf7f70f8> ## <environment: namespace:base></code></pre> <p>Lists are most commonly used when returning a large number of results from a function that do not fit into any of the previous data structures.</p> </div> diff --git a/public/introduction-to-single-cell-rna-seq.html b/public/introduction-to-single-cell-rna-seq.html index 
defb0177fc126252e172b42bcc8aee3c0d2bcc20..25511a2e13e29dc545a53ecd8b4cb8979bd2c01e 100644 --- a/public/introduction-to-single-cell-rna-seq.html +++ b/public/introduction-to-single-cell-rna-seq.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa 
fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa 
fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li 
class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> diff --git a/public/latent-spaces.html b/public/latent-spaces.html index 139298973df072a2fab80a8a0fc36f68349ce876..ef3d3eb25a5ae5e3cc1e51034d7353bc593e1bb8 100644 --- a/public/latent-spaces.html +++ b/public/latent-spaces.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li 
class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> 
Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a 
href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" 
data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> 
Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -573,19 +573,19 @@ largest eigen value accounts for the most variation.</p> </div> <div id="an-example-of-pca" class="section level5"> <h5><span class="header-section-number">9.1.1.1.3</span> An example of PCA</h5> -<div class="sourceCode" id="cb584"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb584-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/deng/deng-reads.rds"</span>)</a> -<a class="sourceLine" id="cb584-2" data-line-number="2">my_color1 <-<span class="st"> </span><span class="kw">createPalette</span>(<span class="dv">6</span>, <span class="kw">c</span>(<span class="st">"#010101"</span>, <span class="st">"#ff0000"</span>), <span class="dt">M=</span><span class="dv">1000</span>)</a> -<a class="sourceLine" id="cb584-3" data-line-number="3"><span class="kw">names</span>(my_color1) <-<span class="st"> </span><span class="kw">unique</span>(<span class="kw">as.character</span>(deng<span class="op">$</span>cell_type1))</a> -<a class="sourceLine" id="cb584-4" data-line-number="4">my_color2 <-<span class="st"> </span><span 
class="kw">createPalette</span>(<span class="dv">10</span>, <span class="kw">c</span>(<span class="st">"#010101"</span>, <span class="st">"#ff0000"</span>), <span class="dt">M=</span><span class="dv">1000</span>)</a> -<a class="sourceLine" id="cb584-5" data-line-number="5"><span class="kw">names</span>(my_color2) <-<span class="st"> </span><span class="kw">unique</span>(<span class="kw">as.character</span>(deng<span class="op">$</span>cell_type2))</a> -<a class="sourceLine" id="cb584-6" data-line-number="6">deng <-<span class="st"> </span><span class="kw">runPCA</span>(deng, <span class="dt">ncomponents =</span> <span class="dv">2</span>)</a> -<a class="sourceLine" id="cb584-7" data-line-number="7"><span class="kw">plotPCA</span>(deng, <span class="dt">colour_by =</span> <span class="st">"cell_type1"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb584-8" data-line-number="8"><span class="st"> </span><span class="kw">scale_fill_manual</span>(<span class="dt">values =</span> my_color1)</a></code></pre></div> +<div class="sourceCode" id="cb652"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb652-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/deng/deng-reads.rds"</span>)</a> +<a class="sourceLine" id="cb652-2" data-line-number="2">my_color1 <-<span class="st"> </span><span class="kw">createPalette</span>(<span class="dv">6</span>, <span class="kw">c</span>(<span class="st">"#010101"</span>, <span class="st">"#ff0000"</span>), <span class="dt">M=</span><span class="dv">1000</span>)</a> +<a class="sourceLine" id="cb652-3" data-line-number="3"><span class="kw">names</span>(my_color1) <-<span class="st"> </span><span class="kw">unique</span>(<span class="kw">as.character</span>(deng<span class="op">$</span>cell_type1))</a> +<a class="sourceLine" id="cb652-4" data-line-number="4">my_color2 <-<span class="st"> </span><span class="kw">createPalette</span>(<span 
class="dv">10</span>, <span class="kw">c</span>(<span class="st">"#010101"</span>, <span class="st">"#ff0000"</span>), <span class="dt">M=</span><span class="dv">1000</span>)</a> +<a class="sourceLine" id="cb652-5" data-line-number="5"><span class="kw">names</span>(my_color2) <-<span class="st"> </span><span class="kw">unique</span>(<span class="kw">as.character</span>(deng<span class="op">$</span>cell_type2))</a> +<a class="sourceLine" id="cb652-6" data-line-number="6">deng <-<span class="st"> </span><span class="kw">runPCA</span>(deng, <span class="dt">ncomponents =</span> <span class="dv">2</span>)</a> +<a class="sourceLine" id="cb652-7" data-line-number="7"><span class="kw">plotPCA</span>(deng, <span class="dt">colour_by =</span> <span class="st">"cell_type1"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb652-8" data-line-number="8"><span class="st"> </span><span class="kw">scale_fill_manual</span>(<span class="dt">values =</span> my_color1)</a></code></pre></div> <pre><code>## Scale for 'fill' is already present. 
Adding another scale for 'fill', ## which will replace the existing scale.</code></pre> <p><img src="latent-spaces_files/figure-html/unnamed-chunk-1-1.png" width="672" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb586"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb586-1" data-line-number="1"><span class="kw">plotPCA</span>(deng, <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb586-2" data-line-number="2"><span class="st"> </span><span class="kw">scale_fill_manual</span>(<span class="dt">values =</span> my_color2)</a></code></pre></div> +<div class="sourceCode" id="cb654"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb654-1" data-line-number="1"><span class="kw">plotPCA</span>(deng, <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb654-2" data-line-number="2"><span class="st"> </span><span class="kw">scale_fill_manual</span>(<span class="dt">values =</span> my_color2)</a></code></pre></div> <pre><code>## Scale for 'fill' is already present. Adding another scale for 'fill', ## which will replace the existing scale.</code></pre> <p><img src="latent-spaces_files/figure-html/unnamed-chunk-1-2.png" width="672" style="display: block; margin: auto;" /></p> @@ -608,7 +608,9 @@ following structure. </div> </div> <div id="glm-pca" class="section level4"> -<h4><span class="header-section-number">9.1.1.2</span> GLM-PCA</h4> +<h4><span class="header-section-number">9.1.1.2</span> <a href="https://rdrr.io/cran/glmpca/">GLM-PCA</a></h4> +<p><span class="citation">(Collins, Dasgupta, and Schapire <a href="#ref-collins2002generalization">2002</a>)</span> +<span class="citation">(Townes et al. 
<a href="#ref-townes2019feature">2019</a>)</span></p> <p>GLM-PCA is a generalized version of the traditional PCA.</p> <p>The traditional PCA implicitly imposes an assumption of Gaussian distribution. The purpose of GLM-PCA is to loosen this condition to accommodate other @@ -631,43 +633,34 @@ exponential-family distribution, and applies appropriate link functions to <span class="math inline">\(u_i\)</span>’s in the same as a GLM does to non-Gaussian responses.</p> <p>The following example compares GLM-PCA with Poisson marginals to the traditional PCA, which is identical to the result from <code>plotPCA</code>.</p> -<div class="sourceCode" id="cb588"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb588-1" data-line-number="1"><span class="co">## GLM-PCA</span></a> -<a class="sourceLine" id="cb588-2" data-line-number="2">Y <-<span class="st"> </span><span class="kw">assay</span>(deng, <span class="st">"counts"</span>)</a> -<a class="sourceLine" id="cb588-3" data-line-number="3">Y <-<span class="st"> </span>Y[<span class="kw">rowSums</span>(Y) <span class="op">></span><span class="st"> </span><span class="dv">0</span>, ]</a> -<a class="sourceLine" id="cb588-4" data-line-number="4"><span class="kw">system.time</span>(res1 <-<span class="st"> </span><span class="kw">glmpca</span>(Y, <span class="dt">L=</span><span class="dv">2</span>, <span class="dt">fam=</span><span class="st">"poi"</span>, <span class="dt">verbose=</span><span class="ot">TRUE</span>))</a></code></pre></div> +<div class="sourceCode" id="cb656"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb656-1" data-line-number="1"><span class="co">## GLM-PCA</span></a> +<a class="sourceLine" id="cb656-2" data-line-number="2">Y <-<span class="st"> </span><span class="kw">assay</span>(deng, <span class="st">"counts"</span>)</a> +<a class="sourceLine" id="cb656-3" data-line-number="3">Y <-<span class="st"> </span>Y[<span class="kw">rowSums</span>(Y) 
<span class="op">></span><span class="st"> </span><span class="dv">0</span>, ]</a> +<a class="sourceLine" id="cb656-4" data-line-number="4"><span class="kw">system.time</span>(res1 <-<span class="st"> </span><span class="kw">glmpca</span>(Y, <span class="dt">L=</span><span class="dv">2</span>, <span class="dt">fam=</span><span class="st">"poi"</span>, <span class="dt">verbose=</span><span class="ot">TRUE</span>))</a></code></pre></div> <pre><code>## user system elapsed -## 82.313 22.987 105.317</code></pre> -<div class="sourceCode" id="cb590"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb590-1" data-line-number="1">pd1 <-<span class="st"> </span><span class="kw">data.frame</span>(res1<span class="op">$</span>factors, <span class="dt">dimreduce=</span><span class="st">"glmpca-poisson"</span>, <span class="dt">clust =</span> <span class="kw">factor</span>(deng<span class="op">$</span>cell_type2))</a> -<a class="sourceLine" id="cb590-2" data-line-number="2"><span class="co">## traditional PCA</span></a> -<a class="sourceLine" id="cb590-3" data-line-number="3">pd2 <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="kw">reducedDim</span>(deng, <span class="st">"PCA"</span>), <span class="dt">dimreduce=</span><span class="st">"runPCA"</span>, <span class="dt">clust =</span> <span class="kw">factor</span>(deng<span class="op">$</span>cell_type2))</a> -<a class="sourceLine" id="cb590-4" data-line-number="4"><span class="kw">colnames</span>(pd2) <-<span class="st"> </span><span class="kw">colnames</span>(pd1)</a> -<a class="sourceLine" id="cb590-5" data-line-number="5"><span class="co">## plot</span></a> -<a class="sourceLine" id="cb590-6" data-line-number="6">pd <-<span class="st"> </span><span class="kw">rbind</span>(pd1, pd2)</a> -<a class="sourceLine" id="cb590-7" data-line-number="7"><span class="kw">ggplot</span>(pd, <span class="kw">aes</span>(<span class="dt">x =</span> dim1, <span class="dt">y =</span> dim2, 
<span class="dt">colour =</span> clust)) <span class="op">+</span></a> -<a class="sourceLine" id="cb590-8" data-line-number="8"><span class="st"> </span><span class="kw">geom_point</span>(<span class="dt">size=</span><span class="dv">2</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb590-9" data-line-number="9"><span class="st"> </span><span class="kw">facet_wrap</span>(<span class="op">~</span>dimreduce, <span class="dt">scales=</span><span class="st">"free"</span>, <span class="dt">nrow=</span><span class="dv">3</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb590-10" data-line-number="10"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color2) <span class="op">+</span></a> -<a class="sourceLine" id="cb590-11" data-line-number="11"><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div> +## 94.261 25.207 119.499</code></pre> +<div class="sourceCode" id="cb658"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb658-1" data-line-number="1">pd1 <-<span class="st"> </span><span class="kw">data.frame</span>(res1<span class="op">$</span>factors, <span class="dt">dimreduce=</span><span class="st">"glmpca-poisson"</span>, <span class="dt">clust =</span> <span class="kw">factor</span>(deng<span class="op">$</span>cell_type2))</a> +<a class="sourceLine" id="cb658-2" data-line-number="2"><span class="co">## traditional PCA</span></a> +<a class="sourceLine" id="cb658-3" data-line-number="3">pd2 <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="kw">reducedDim</span>(deng, <span class="st">"PCA"</span>), <span class="dt">dimreduce=</span><span class="st">"runPCA"</span>, <span class="dt">clust =</span> <span class="kw">factor</span>(deng<span class="op">$</span>cell_type2))</a> +<a class="sourceLine" id="cb658-4" data-line-number="4"><span class="kw">colnames</span>(pd2) <-<span class="st"> </span><span 
class="kw">colnames</span>(pd1)</a> +<a class="sourceLine" id="cb658-5" data-line-number="5"><span class="co">## plot</span></a> +<a class="sourceLine" id="cb658-6" data-line-number="6">pd <-<span class="st"> </span><span class="kw">rbind</span>(pd1, pd2)</a> +<a class="sourceLine" id="cb658-7" data-line-number="7"><span class="kw">ggplot</span>(pd, <span class="kw">aes</span>(<span class="dt">x =</span> dim1, <span class="dt">y =</span> dim2, <span class="dt">colour =</span> clust)) <span class="op">+</span></a> +<a class="sourceLine" id="cb658-8" data-line-number="8"><span class="st"> </span><span class="kw">geom_point</span>(<span class="dt">size=</span><span class="dv">2</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb658-9" data-line-number="9"><span class="st"> </span><span class="kw">facet_wrap</span>(<span class="op">~</span>dimreduce, <span class="dt">scales=</span><span class="st">"free"</span>, <span class="dt">nrow=</span><span class="dv">3</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb658-10" data-line-number="10"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color2) <span class="op">+</span></a> +<a class="sourceLine" id="cb658-11" data-line-number="11"><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div> <p><img src="latent-spaces_files/figure-html/glmpca-1.png" width="672" style="display: block; margin: auto;" /></p> <p>Let us compare GLM-PCA and standard PCA (using normalized log-counts data) on the Tung data, before cells have been QC’d.</p> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its -## own size factors</code></pre> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own -## size factors</code></pre> <p><img src="latent-spaces_files/figure-html/unnamed-chunk-2-1.png" width="672" style="display: block; margin: auto;" /></p> <p>Repeat these 
plots with the QC’d Tung data.</p> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its -## own size factors</code></pre> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own -## size factors</code></pre> <p><img src="latent-spaces_files/figure-html/unnamed-chunk-3-1.png" width="672" style="display: block; margin: auto;" /></p> </div> </div> <div id="tsne-t-distributed-stochastic-neighbor-embedding" class="section level3"> <h3><span class="header-section-number">9.1.2</span> tSNE: t-Distributed Stochastic Neighbor Embedding</h3> -<p>t-SNE is an advanced version of the original SNE algorithm. <font color="red"> -[ref] </font></p> +<p>t-SNE <span class="citation">(Maaten and Hinton <a href="#ref-maaten2008visualizing">2008</a>)</span> is an advanced version of the original SNE algorithm. <span class="citation">(Hinton and Roweis <a href="#ref-hinton2003stochastic">2003</a>)</span></p> <div id="motivation" class="section level4"> <h4><span class="header-section-number">9.1.2.1</span> Motivation</h4> <p>The weakness of PCA is the motivation behind the SNE algorithm.</p> @@ -736,23 +729,22 @@ other - what happens when we embed in 1D? 
<img src="figures/crowding.png" style="width:60.0%" /> </center></li> <li><strong>Solution:</strong><br /> -Change the distribution of the low-dimensional data <span class="math inline">\(Q\)</span> into a student-t distribution.<br /> - +Change the distribution of the low-dimensional data <span class="math inline">\(Q\)</span> into a student-t distribution.</li> +</ul> <center> <img src="figures/t.png" style="width:50.0%" /> </center> -Recall that SNE is trying to minimize the dissimilarity of <span class="math inline">\(P\)</span> and <span class="math inline">\(Q\)</span>, +<p>Recall that SNE is trying to minimize the dissimilarity of <span class="math inline">\(P\)</span> and <span class="math inline">\(Q\)</span>, and <span class="math inline">\(P\)</span> has a Gaussian distribution. -So for a pair of points (<span class="math inline">\(x_i\)</span> and <span class="math inline">\(x_j\)</span> in high-dimension, <span class="math inline">\(y_i\)</span> and <span class="math inline">\(y_j\)</span> in low-dimension) to reach the same probability, the distance between <span class="math inline">\(y_i\)</span> and <span class="math inline">\(y_j\)</span> would be much larger (i.e. much farther apart).</li> -</ul> +So for a pair of points (<span class="math inline">\(x_i\)</span> and <span class="math inline">\(x_j\)</span> in high-dimension, <span class="math inline">\(y_i\)</span> and <span class="math inline">\(y_j\)</span> in low-dimension) to reach the same probability, the distance between <span class="math inline">\(y_i\)</span> and <span class="math inline">\(y_j\)</span> would be much larger (i.e. 
much farther apart).</p> </div> <div id="example-of-t-sne" class="section level4"> <h4><span class="header-section-number">9.1.2.4</span> <strong>Example of t-SNE:</strong> </h4> -<div class="sourceCode" id="cb595"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb595-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/muraro.rds"</span>)</a> -<a class="sourceLine" id="cb595-2" data-line-number="2">tmp <-<span class="st"> </span><span class="kw">runTSNE</span>(muraro, <span class="dt">perplexity =</span> <span class="dv">3</span>)</a> -<a class="sourceLine" id="cb595-3" data-line-number="3"><span class="kw">plotTSNE</span>(tmp, <span class="dt">colour_by =</span> <span class="st">"cell_type1"</span>)</a> -<a class="sourceLine" id="cb595-4" data-line-number="4">tmp <-<span class="st"> </span><span class="kw">runTSNE</span>(muraro, <span class="dt">perplexity =</span> <span class="dv">50</span>)</a> -<a class="sourceLine" id="cb595-5" data-line-number="5"><span class="kw">plotTSNE</span>(tmp, <span class="dt">colour_by =</span> <span class="st">"cell_type1"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb659"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb659-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/muraro.rds"</span>)</a> +<a class="sourceLine" id="cb659-2" data-line-number="2">tmp <-<span class="st"> </span><span class="kw">runTSNE</span>(muraro, <span class="dt">perplexity =</span> <span class="dv">3</span>)</a> +<a class="sourceLine" id="cb659-3" data-line-number="3"><span class="kw">plotTSNE</span>(tmp, <span class="dt">colour_by =</span> <span class="st">"cell_type1"</span>)</a> +<a class="sourceLine" id="cb659-4" data-line-number="4">tmp <-<span class="st"> </span><span class="kw">runTSNE</span>(muraro, <span class="dt">perplexity 
=</span> <span class="dv">50</span>)</a> +<a class="sourceLine" id="cb659-5" data-line-number="5"><span class="kw">plotTSNE</span>(tmp, <span class="dt">colour_by =</span> <span class="st">"cell_type1"</span>)</a></code></pre></div> <p><img src="latent-spaces_files/figure-html/tsne-1.png" width="672" style="display: block; margin: auto;" /><img src="latent-spaces_files/figure-html/tsne-2.png" width="672" style="display: block; margin: auto;" /></p> </div> <div id="limits-of-t-sne" class="section level4"> @@ -773,8 +765,8 @@ Therefore can merely be used for visualization.<br /> </div> <div id="manifold-methods" class="section level3"> <h3><span class="header-section-number">9.1.3</span> Manifold methods</h3> -<div id="umap-uniform-manifold-approximation-and-projection" class="section level4"> -<h4><span class="header-section-number">9.1.3.1</span> UMAP: Uniform Manifold Approximation and Projection</h4> +<div id="umap-uniform-manifold-approximation-and-projection-mcinnes2018umap" class="section level4"> +<h4><span class="header-section-number">9.1.3.1</span> UMAP: Uniform Manifold Approximation and Projection <span class="citation">(McInnes, Healy, and Melville <a href="#ref-mcinnes2018umap">2018</a>)</span></h4> <div id="advantages-of-umap-over-t-sne" class="section level5"> <h5><span class="header-section-number">9.1.3.1.1</span> <strong>Advantages of UMAP over t-SNE:</strong></h5> <ul> @@ -824,13 +816,13 @@ To be able to project this onto an undirected graph, we need to solve the disagr </div> <div id="example-of-umap" class="section level5"> <h5><span class="header-section-number">9.1.3.1.4</span> Example of UMAP</h5> -<div class="sourceCode" id="cb596"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb596-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">runUMAP</span>(muraro)</a> -<a class="sourceLine" id="cb596-2" data-line-number="2"><span class="kw">plotUMAP</span>(muraro, <span 
class="dt">colour_by=</span><span class="st">"cell_type1"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb660"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb660-1" data-line-number="1">muraro <-<span class="st"> </span><span class="kw">runUMAP</span>(muraro)</a> +<a class="sourceLine" id="cb660-2" data-line-number="2"><span class="kw">plotUMAP</span>(muraro, <span class="dt">colour_by=</span><span class="st">"cell_type1"</span>)</a></code></pre></div> <p><img src="latent-spaces_files/figure-html/umap-1.png" width="672" style="display: block; margin: auto;" /></p> </div> </div> -<div id="phate" class="section level4"> -<h4><span class="header-section-number">9.1.3.2</span> PHATE</h4> +<div id="phate-moon2017phate" class="section level4"> +<h4><span class="header-section-number">9.1.3.2</span> PHATE <span class="citation">(Moon et al. <a href="#ref-moon2017phate">2017</a>)</span></h4> <div id="sketch-of-algorithm" class="section level5"> <h5><span class="header-section-number">9.1.3.2.1</span> Sketch of algorithm</h5> <ul> @@ -882,27 +874,124 @@ PHATE measures the distance between probability distributions <span class="math </div> <div id="example-of-phate" class="section level5"> <h5><span class="header-section-number">9.1.3.2.2</span> Example of PHATE</h5> +<div class="sourceCode" id="cb661"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb661-1" data-line-number="1"><span class="kw">library</span>(phateR)</a> +<a class="sourceLine" id="cb661-2" data-line-number="2">deng_phate <-<span class="st"> </span><span class="kw">phate</span>(<span class="kw">t</span>(<span class="kw">assay</span>(deng, <span class="st">"logcounts"</span>)))</a> +<a class="sourceLine" id="cb661-3" data-line-number="3">dt <-<span class="st"> </span><span class="kw">data.frame</span>(deng_phate<span class="op">$</span>embedding, <span class="dt">clust =</span> deng<span class="op">$</span>cell_type1)</a> +<a 
class="sourceLine" id="cb661-4" data-line-number="4"><span class="kw">palette</span>(<span class="kw">rainbow</span>(<span class="dv">10</span>))</a> +<a class="sourceLine" id="cb661-5" data-line-number="5"><span class="kw">ggplot</span>(dt, <span class="kw">aes</span>(<span class="dt">x=</span>PHATE1, <span class="dt">y=</span>PHATE2, <span class="dt">color=</span>clust)) <span class="op">+</span></a> +<a class="sourceLine" id="cb661-6" data-line-number="6"><span class="st"> </span><span class="kw">geom_point</span>()</a></code></pre></div> +<center> +<img src="figures/phate.png" /> +</center> </div> </div> </div> </div> <div id="matrix-factorization-and-factor-analysis" class="section level2"> <h2><span class="header-section-number">9.2</span> Matrix factorization and factor analysis</h2> -<p>Factor Analysis is similar to PCA in that, -they both aim to obtain a new set of distinct summary variables, -which are fewer in number than the original number of variables.</p> -<p>The key concept of factor analysis is that the original, observed variables are +<p><strong>The key concept of factor analysis</strong>: The original, observed variables are correlated because they are all associated with some unobservable variables, -called latent factors.</p> -<p>The variance of a variable can be splitted into two parts:<br /> +the <strong>latent factors</strong>.</p> +<p>It looks similar to PCA, but instead of dimensionality reduction, factor analysis +focuses on studying the latent factors.</p> +<p>The variance of an observed variable can be splitted into two parts:<br /> - Common variance: the part of variance that is explained by latent factors;<br /> -- Unique variance: the part that is specific to only one variable, usually considered as an error component or residual.</p> +- Unique variance: the part that is specific to only one variable, usually considered as an error component or <strong>residual</strong>.</p> +<p>The <strong>factor loadings</strong> or weights 
indicate how much each latent factor is affecting the observed features.</p> +<center> +<img src="figures/FA.png" style="width:60.0%" /> +</center> +<div id="slalom-interpretable-latent-spaces" class="section level3"> +<h3><span class="header-section-number">9.2.1</span> <a href="https://bioconductor.org/packages/release/bioc/html/slalom.html">Slalom</a>: Interpretable latent spaces</h3> +<p>Highlight of Slalom: <span class="citation">(Buettner et al. <a href="#ref-buettner2017f">2017</a>)</span></p> +<ul> +<li><p>It incorporates prior information to help the model estimation;</p></li> +<li><p>It learns whatever not provided by prior knowledge in the model training process;</p></li> +<li><p>It enforces sparsity in the weight matrix.</p></li> +</ul> +<div id="methodology" class="section level4"> +<h4><span class="header-section-number">9.2.1.1</span> Methodology</h4> +<p><strong>Matrix expression of factor analysis:</strong></p> +<center> +<img src="figures/FA_matrix.png" style="width:80.0%" /> +</center> +<p><strong>How prior knowledge affects the model:</strong></p> <center> -<img src="figures/FA.png" style="width:80.0%" /> +<img src="figures/slalom_anno.png" /> </center> +<ul> +<li><span class="math inline">\(I_{g, k}\)</span>: (observed) Indicator of whether a gene <span class="math inline">\(g\)</span> is annotated to a given pathway or factor <span class="math inline">\(k\)</span>;<br /> +</li> +<li><span class="math inline">\(z_{g, k}\)</span>: (latent) Indicator of whether factor <span class="math inline">\(k\)</span> has a regulatory effect on gene <span class="math inline">\(g\)</span>;<br /> +</li> +<li><span class="math inline">\(w_{g, k}\)</span>: (estimated) weights.</li> +</ul> +<p><strong>grey arrow</strong>: +<span class="math display">\[ P(I_{g, k}\vert z_{g, k}) = \begin{cases} +\text{Bernoulli}(p_1), \text{if } z_{g, k} = 1\\ +\text{Bernoulli}(p_2), \text{if } z_{g, k} = 0\\ +\end{cases}\]</span></p> +<p><strong>green arrow</strong>: +<span 
class="math display">\[ P(w_{g, k}\vert z_{g, k}) = \begin{cases} +N(w_{g, k}, 1/\alpha), \text{ if } z_{g, k} = 1\\ +\delta_0(w_{g, k}), \text{ if } z_{g, k} = 0\\ +\end{cases}\]</span></p> +<center> +<img src="figures/slab_spike.png" /> +</center> +<p>We only look at the part of the <strong>likelihood</strong> that is relevant to this part: +<span class="math inline">\(\prod_{g} \prod_{k}P(I_{g, k}, w_{g, k}, z_{g, k})\)</span>,<br /> +where <span class="math inline">\(P(I_{g, k}, w_{g, k}, z_{g, k}) = P(I_{g, k}, w_{g, k}| z_{g, k})P(z_{g,k}) = P( I_{g, k}| z_{g, k})P( w_{g, k}| z_{g, k})P(z_{g,k})\)</span>. +Since we do not know anything about <span class="math inline">\(z_{g,k}\)</span>, it is assumed to be Bernoulli(1/2).</p> +</div> +<div id="example" class="section level4"> +<h4><span class="header-section-number">9.2.1.2</span> Example</h4> +<p>First, get a geneset in a <code>GeneSetCollection</code> object.</p> +<div class="sourceCode" id="cb662"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb662-1" data-line-number="1">gmtfile <-<span class="st"> </span><span class="kw">system.file</span>(<span class="st">"extdata"</span>, <span class="st">"reactome_subset.gmt"</span>, <span class="dt">package =</span> <span class="st">"slalom"</span>)</a> +<a class="sourceLine" id="cb662-2" data-line-number="2">genesets <-<span class="st"> </span>GSEABase<span class="op">::</span><span class="kw">getGmt</span>(gmtfile)</a></code></pre></div> +<p>Then we create an <code>Rcpp_SlalomModel</code> object containing the input data and genesets (and subsequent results) for the model.</p> +<div class="sourceCode" id="cb663"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb663-1" data-line-number="1">model_deng <-<span class="st"> </span><span class="kw">newSlalomModel</span>(deng, genesets, <span class="dt">n_hidden =</span> <span class="dv">5</span>, <span class="dt">min_genes =</span> <span
class="dv">10</span>)</a></code></pre></div> +<pre><code>## 29 annotated factors retained; 1 annotated factors dropped. +## 1072 genes retained for analysis.</code></pre> +<p>Initialize the model:</p> +<div class="sourceCode" id="cb665"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb665-1" data-line-number="1">model_deng <-<span class="st"> </span><span class="kw">initSlalom</span>(model_deng, <span class="dt">seed =</span> <span class="dv">100</span>)</a></code></pre></div> +<p>Fit/train the model:</p> +<div class="sourceCode" id="cb666"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb666-1" data-line-number="1">model_deng <-<span class="st"> </span><span class="kw">trainSlalom</span>(model_deng, <span class="dt">nIterations =</span> <span class="dv">1000</span>, <span class="dt">seed =</span> <span class="dv">100</span>, <span class="dt">tolerance =</span> <span class="fl">0.001</span>)</a></code></pre></div> +<pre><code>## pre-training model for faster convergence +## iteration 0 +## Model not converged after 50 iterations. +## iteration 0 +## Model not converged after 50 iterations. 
+## iteration 0 +## Switched off factor 29 +## Switched off factor 20 +## Switched off factor 32 +## Switched off factor 28 +## Switched off factor 13 +## Switched off factor 27 +## Switched off factor 10 +## iteration 100 +## Switched off factor 22 +## iteration 200 +## iteration 300 +## iteration 400 +## iteration 500 +## iteration 600 +## iteration 700 +## Model converged after 701 iterations.</code></pre> +<p>View results:<br /> +The <code>plotRelevance</code> function displays the most relevant terms (factors/pathways) ranked by relevance, showing gene set size and the number of genes gained/lost as active in the pathway as learnt by the model.</p> +<div class="sourceCode" id="cb668"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb668-1" data-line-number="1"><span class="kw">plotRelevance</span>(model_deng)</a></code></pre></div> +<p><img src="latent-spaces_files/figure-html/unnamed-chunk-8-1.png" width="960" style="display: block; margin: auto;" /> +The <code>plotTerms</code> function shows the relevance of all terms in the model, enabling the identification of the most important pathways in the context of all that were included in the model.</p> +<div class="sourceCode" id="cb669"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb669-1" data-line-number="1"><span class="kw">plotTerms</span>(model_deng)</a></code></pre></div> +<p><img src="latent-spaces_files/figure-html/unnamed-chunk-9-1.png" width="672" style="display: block; margin: auto;" /></p> +</div> +</div> </div> <div id="autoencoders" class="section level2"> <h2><span class="header-section-number">9.3</span> Autoencoders</h2> +<p><span class="citation">(Kingma and Welling <a href="#ref-kingma2013auto">2013</a>)</span></p> <center> <img src="figures/vae.jpg" style="width:80.0%" /> </center> @@ -936,14 +1025,35 @@ it becomes small if <span class="math inline">\(Q\)</span> is high-capacity. 
</li> <li>RHS: what we can maximize through stochastic gradient descent.</li> </ul> + </div> </div> -<div id="interpretable-latent-spaces" class="section level2"> -<h2><span class="header-section-number">9.4</span> Interpretable latent spaces</h2> -<div id="slalom" class="section level3"> -<h3><span class="header-section-number">9.4.1</span> Slalom</h3> - </div> +<h3> References</h3> +<div id="refs" class="references"> +<div id="ref-buettner2017f"> +<p>Buettner, Florian, Naruemon Pratanwanich, Davis J McCarthy, John C Marioni, and Oliver Stegle. 2017. “F-scLVM: Scalable and Versatile Factor Analysis for Single-Cell Rna-Seq.†<em>Genome Biology</em> 18 (1). BioMed Central: 212.</p> +</div> +<div id="ref-collins2002generalization"> +<p>Collins, Michael, Sanjoy Dasgupta, and Robert E Schapire. 2002. “A Generalization of Principal Components Analysis to the Exponential Family.†In <em>Advances in Neural Information Processing Systems</em>, 617–24.</p> +</div> +<div id="ref-hinton2003stochastic"> +<p>Hinton, Geoffrey E, and Sam T Roweis. 2003. “Stochastic Neighbor Embedding.†In <em>Advances in Neural Information Processing Systems</em>, 857–64.</p> +</div> +<div id="ref-kingma2013auto"> +<p>Kingma, Diederik P, and Max Welling. 2013. “Auto-Encoding Variational Bayes.†<em>arXiv Preprint arXiv:1312.6114</em>.</p> +</div> +<div id="ref-maaten2008visualizing"> +<p>Maaten, Laurens van der, and Geoffrey Hinton. 2008. “Visualizing Data Using T-Sne.†<em>Journal of Machine Learning Research</em> 9 (Nov): 2579–2605.</p> +</div> +<div id="ref-mcinnes2018umap"> +<p>McInnes, Leland, John Healy, and James Melville. 2018. “Umap: Uniform Manifold Approximation and Projection for Dimension Reduction.†<em>arXiv Preprint arXiv:1802.03426</em>.</p> +</div> +<div id="ref-moon2017phate"> +<p>Moon, Kevin R, David van Dijk, Zheng Wang, William Chen, Matthew J Hirn, Ronald R Coifman, Natalia B Ivanova, Guy Wolf, and Smita Krishnaswamy. 2017. 
“PHATE: A Dimensionality Reduction Method for Visualizing Trajectory Structures in High-Dimensional Biological Data.†<em>bioRxiv</em>. Cold Spring Harbor Laboratory, 120378.</p> +</div> +<div id="ref-townes2019feature"> +<p>Townes, F William, Stephanie C Hicks, Martin J Aryee, and Rafael A Irizarry. 2019. “Feature Selection and Dimension Reduction for Single Cell Rna-Seq Based on a Multinomial Model.†<em>bioRxiv</em>. Cold Spring Harbor Laboratory, 574574.</p> </div> </div> </section> diff --git a/public/latent-spaces.md b/public/latent-spaces.md index 62260c4364049354e0f14baae7f069a69ea85249..f9986748f36a404aaf2d019c015314387bb09a4b 100644 --- a/public/latent-spaces.md +++ b/public/latent-spaces.md @@ -166,6 +166,7 @@ library(SingleCellExperiment) library(glmpca) library(ggplot2) library(Polychrome) +library(slalom) ``` # Latent spaces @@ -276,7 +277,9 @@ plotPCA(deng, colour_by = "cell_type2") + non-linear dependencies. For instance, PCA would not be able to “unroll†the following structure.\ <center> {width=30%} </center> -#### GLM-PCA +#### [GLM-PCA](https://rdrr.io/cran/glmpca/) +[@collins2002generalization] +[@townes2019feature] GLM-PCA is a generalized version of the traditional PCA. @@ -317,7 +320,7 @@ system.time(res1 <- glmpca(Y, L=2, fam="poi", verbose=TRUE)) ``` ## user system elapsed -## 82.313 22.987 105.317 +## 94.261 25.207 119.499 ``` ```r @@ -339,38 +342,15 @@ ggplot(pd, aes(x = dim1, y = dim2, colour = clust)) + Let us compare GLM-PCA and standard PCA (using normalized log-counts data) on the Tung data, before cells have been QC'd. - -``` -## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its -## own size factors -``` - -``` -## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own -## size factors -``` - <img src="latent-spaces_files/figure-html/unnamed-chunk-2-1.png" width="672" style="display: block; margin: auto;" /> Repeat these plots with the QC'd Tung data. 
- -``` -## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its -## own size factors -``` - -``` -## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own -## size factors -``` - <img src="latent-spaces_files/figure-html/unnamed-chunk-3-1.png" width="672" style="display: block; margin: auto;" /> ### tSNE: t-Distributed Stochastic Neighbor Embedding -t-SNE is an advanced version of the original SNE algorithm. <font color="red"> -[ref] </font> +t-SNE [@maaten2008visualizing] is an advanced version of the original SNE algorithm. [@hinton2003stochastic] #### Motivation @@ -443,8 +423,10 @@ other - what happens when we embed in 1D? - __Solution:__\ -Change the distribution of the low-dimensional data $Q$ into a student-t distribution. \ +Change the distribution of the low-dimensional data $Q$ into a student-t distribution. + <center>{width=50%} </center> + Recall that SNE is trying to minimize the dissimilarity of $P$ and $Q$, and $P$ has a Gaussian distribution. So for a pair of points ($x_i$ and $x_j$ in high-dimension, $y_i$ and $y_j$ in low-dimension) to reach the same probability, the distance between $y_i$ and $y_j$ would be much larger (i.e. much farther apart). 
@@ -474,7 +456,7 @@ Therefore can merely be used for visualization.\ ### Manifold methods -#### UMAP: Uniform Manifold Approximation and Projection +#### UMAP: Uniform Manifold Approximation and Projection [@mcinnes2018umap] ##### __Advantages of UMAP over t-SNE:__ @@ -524,7 +506,7 @@ plotUMAP(muraro, colour_by="cell_type1") -#### PHATE +#### PHATE [@moon2017phate] ##### Sketch of algorithm @@ -576,36 +558,153 @@ $D^t(x_i, x_j) = \Vert \log(p_{x_i}^t) - \log(p_{x_j}^t) \Vert^2$ ##### Example of PHATE - - - +```r +library(phateR) +deng_phate <- phate(t(assay(deng, "logcounts"))) +dt <- data.frame(deng_phate$embedding, clust = deng$cell_type1) +palette(rainbow(10)) +ggplot(dt, aes(x=PHATE1, y=PHATE2, color=clust)) + + geom_point() +``` +<center>  </center> +## Matrix factorization and factor analysis +__The key concept of factor analysis__: The original, observed variables are +correlated because they are all associated with some unobservable variables, +the __latent factors__. +It looks similar to PCA, but instead of dimensionality reduction, factor analysis +focuses on studying the latent factors. +The variance of an observed variable can be split into two parts: \ +- Common variance: the part of variance that is explained by latent factors; \ +- Unique variance: the part that is specific to only one variable, usually considered as an error component or __residual__. -## Matrix factorization and factor analysis +The __factor loadings__ or weights indicate how much each latent factor affects the observed features. -Factor Analysis is similar to PCA in that, -they both aim to obtain a new set of distinct summary variables, -which are fewer in number than the original number of variables. +<center> {width=60%} </center> -The key concept of factor analysis is that the original, observed variables are -correlated because they are all associated with some unobservable variables, -called latent factors. 
+### [Slalom](https://bioconductor.org/packages/release/bioc/html/slalom.html): Interpretable latent spaces -The variance of a variable can be splitted into two parts: \ -- Common variance: the part of variance that is explained by latent factors; \ -- Unique variance: the part that is specific to only one variable, usually considered as an error component or residual. +Highlight of Slalom: [@buettner2017f] + +- It incorporates prior information to help the model estimation; + +- It learns whatever is not provided by prior knowledge in the model training process; + +- It enforces sparsity in the weight matrix. + +#### Methodology + +__Matrix expression of factor analysis:__ -<center> {width=80%} </center> +<center>{width=80%} </center> +__How prior knowledge affects the model:__ +<center> </center> + +- $I_{g, k}$: (observed) Indicator of whether a gene $g$ is annotated to a given pathway or factor $k$;\ +- $z_{g, k}$: (latent) Indicator of whether factor $k$ has a regulatory effect on gene $g$;\ +- $w_{g, k}$: (estimated) weights. + +__grey arrow__: +$$ P(I_{g, k}\vert z_{g, k}) = \begin{cases} +\text{Bernoulli}(p_1), \text{if } z_{g, k} = 1\\ +\text{Bernoulli}(p_2), \text{if } z_{g, k} = 0\\ +\end{cases}$$ + +__green arrow__: +$$ P(w_{g, k}\vert z_{g, k}) = \begin{cases} +N(w_{g, k}, 1/\alpha), \text{ if } z_{g, k} = 1\\ +\delta_0(w_{g, k}), \text{ if } z_{g, k} = 0\\ +\end{cases}$$ + +<center></center> + +We only look at the part of the __likelihood__ that is relevant to this part: +$\prod_{g} \prod_{k}P(I_{g, k}, w_{g, k}, z_{g, k})$, \ +where $P(I_{g, k}, w_{g, k}, z_{g, k}) = P(I_{g, k}, w_{g, k}| z_{g, k})P(z_{g,k}) += P( I_{g, k}| z_{g, k})P( w_{g, k}| z_{g, k})P(z_{g,k})$. +Since we do not know anything about $z_{g,k}$, it is assumed to be Bernoulli(1/2). + +#### Example +First, get a geneset in a `GeneSetCollection` object. 
+ +```r +gmtfile <- system.file("extdata", "reactome_subset.gmt", package = "slalom") +genesets <- GSEABase::getGmt(gmtfile) +``` + +Then we create an `Rcpp_SlalomModel` object containing the input data and genesets (and subsequent results) for the model. + +```r +model_deng <- newSlalomModel(deng, genesets, n_hidden = 5, min_genes = 10) +``` + +``` +## 29 annotated factors retained; 1 annotated factors dropped. +## 1072 genes retained for analysis. +``` + +Initialize the model: + +```r +model_deng <- initSlalom(model_deng, seed = 100) +``` + +Fit/train the model: + +```r +model_deng <- trainSlalom(model_deng, nIterations = 1000, seed = 100, tolerance = 0.001) +``` + +``` +## pre-training model for faster convergence +## iteration 0 +## Model not converged after 50 iterations. +## iteration 0 +## Model not converged after 50 iterations. +## iteration 0 +## Switched off factor 29 +## Switched off factor 20 +## Switched off factor 32 +## Switched off factor 28 +## Switched off factor 13 +## Switched off factor 27 +## Switched off factor 10 +## iteration 100 +## Switched off factor 22 +## iteration 200 +## iteration 300 +## iteration 400 +## iteration 500 +## iteration 600 +## iteration 700 +## Model converged after 701 iterations. +``` + +View results:\ +The `plotRelevance` function displays the most relevant terms (factors/pathways) ranked by relevance, showing gene set size and the number of genes gained/lost as active in the pathway as learnt by the model. + +```r +plotRelevance(model_deng) +``` + +<img src="latent-spaces_files/figure-html/unnamed-chunk-8-1.png" width="960" style="display: block; margin: auto;" /> +The `plotTerms` function shows the relevance of all terms in the model, enabling the identification of the most important pathways in the context of all that were included in the model. 
+ +```r +plotTerms(model_deng) +``` +<img src="latent-spaces_files/figure-html/unnamed-chunk-9-1.png" width="672" style="display: block; margin: auto;" /> -## Autoencoders +## Autoencoders +[@kingma2013auto] <center>{width=80%}</center> @@ -643,9 +742,7 @@ $$ \log P(X) - KL[Q(Z|X)\Vert P(Z|X)] = E_{Z\sim Q}[\log P(X|Z)] - KL[Q(Z|X)\Ver -## Interpretable latent spaces -### Slalom diff --git a/public/latent-spaces_files/figure-html/glmpca-1.png b/public/latent-spaces_files/figure-html/glmpca-1.png index 2559ee65458e08b708bf7b30b63f6fc6b7ee3287..874a0cfbf9cc88ba2e143a78a404102ffad6c806 100644 Binary files a/public/latent-spaces_files/figure-html/glmpca-1.png and b/public/latent-spaces_files/figure-html/glmpca-1.png differ diff --git a/public/latent-spaces_files/figure-html/tsne-1.png b/public/latent-spaces_files/figure-html/tsne-1.png index 38a553811bb1120a7467ed4649aad0f2d92eda72..6d247eebea219dda2b46a9fcf96159f4b0860a57 100644 Binary files a/public/latent-spaces_files/figure-html/tsne-1.png and b/public/latent-spaces_files/figure-html/tsne-1.png differ diff --git a/public/latent-spaces_files/figure-html/tsne-2.png b/public/latent-spaces_files/figure-html/tsne-2.png index 393e6cb5ff8e60c4e86f942d583078d1103972c9..0368cb3e53e29005cce1167338c23b168913f573 100644 Binary files a/public/latent-spaces_files/figure-html/tsne-2.png and b/public/latent-spaces_files/figure-html/tsne-2.png differ diff --git a/public/latent-spaces_files/figure-html/umap-1.png b/public/latent-spaces_files/figure-html/umap-1.png index 16593c7bfd92b2f6132f66d2797c71c0149d1a28..49f066eef071d6623ed20d4052a0fc802b700720 100644 Binary files a/public/latent-spaces_files/figure-html/umap-1.png and b/public/latent-spaces_files/figure-html/umap-1.png differ diff --git a/public/latent-spaces_files/figure-html/unnamed-chunk-1-1.png b/public/latent-spaces_files/figure-html/unnamed-chunk-1-1.png index accd006586589edcfe6a8b365cf6c663c2cc01cd..e814b400c7f69c9308365e15183c86a03cbc64bc 100644 Binary files 
a/public/latent-spaces_files/figure-html/unnamed-chunk-1-1.png and b/public/latent-spaces_files/figure-html/unnamed-chunk-1-1.png differ diff --git a/public/latent-spaces_files/figure-html/unnamed-chunk-1-2.png b/public/latent-spaces_files/figure-html/unnamed-chunk-1-2.png index be7163e72410f10de19dd20ef45b93fbc6f0a0ba..ac94b0d3b2fa1bec0b6dba3d37f961e02a117b71 100644 Binary files a/public/latent-spaces_files/figure-html/unnamed-chunk-1-2.png and b/public/latent-spaces_files/figure-html/unnamed-chunk-1-2.png differ diff --git a/public/latent-spaces_files/figure-html/unnamed-chunk-2-1.png b/public/latent-spaces_files/figure-html/unnamed-chunk-2-1.png index 9750536527288fa498cf85d0a5a3d66c6f713053..4161699e12b61839aa8e09d5a416550aa8716667 100644 Binary files a/public/latent-spaces_files/figure-html/unnamed-chunk-2-1.png and b/public/latent-spaces_files/figure-html/unnamed-chunk-2-1.png differ diff --git a/public/latent-spaces_files/figure-html/unnamed-chunk-3-1.png b/public/latent-spaces_files/figure-html/unnamed-chunk-3-1.png index e2cbd97d2f5be6b7ca31d51b5e508ca7f38dcf65..ccc56ffae3e9d519904c6045dd7cf655d3699797 100644 Binary files a/public/latent-spaces_files/figure-html/unnamed-chunk-3-1.png and b/public/latent-spaces_files/figure-html/unnamed-chunk-3-1.png differ diff --git a/public/latent-spaces_files/figure-html/unnamed-chunk-8-1.png b/public/latent-spaces_files/figure-html/unnamed-chunk-8-1.png new file mode 100644 index 0000000000000000000000000000000000000000..2f1809e5c12b1652d2a9ff7934c27388561129b4 Binary files /dev/null and b/public/latent-spaces_files/figure-html/unnamed-chunk-8-1.png differ diff --git a/public/latent-spaces_files/figure-html/unnamed-chunk-9-1.png b/public/latent-spaces_files/figure-html/unnamed-chunk-9-1.png new file mode 100644 index 0000000000000000000000000000000000000000..8b6f8616e422aec39efbdba2a5e44061c4bb8f29 Binary files /dev/null and b/public/latent-spaces_files/figure-html/unnamed-chunk-9-1.png differ diff --git 
a/public/normalization-confounders-and-batch-correction.html b/public/normalization-confounders-and-batch-correction.html index f7e2857fce65bef62ffd7d78dc420931ef9ac499..c2f7723e64bdb43e9651b217bdfecb9203be1b0a 100644 --- a/public/normalization-confounders-and-batch-correction.html +++ b/public/normalization-confounders-and-batch-correction.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" 
data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" 
data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li 
class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -614,12 +614,12 @@ Note that spike-ins should be excluded from the calculation of total expression in order to correct for total cell RNA content, therefore we will only use endogenous genes. 
Example of a <strong>CPM</strong> function in <code>R</code> (using the <code>scater</code> package):</p> -<div class="sourceCode" id="cb297"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb297-1" data-line-number="1">calc_cpm <-</a> -<a class="sourceLine" id="cb297-2" data-line-number="2"><span class="cf">function</span> (expr_mat, <span class="dt">spikes =</span> <span class="ot">NULL</span>) </a> -<a class="sourceLine" id="cb297-3" data-line-number="3">{</a> -<a class="sourceLine" id="cb297-4" data-line-number="4"> norm_factor <-<span class="st"> </span><span class="kw">colSums</span>(expr_mat[<span class="op">-</span>spikes, ])</a> -<a class="sourceLine" id="cb297-5" data-line-number="5"> <span class="kw">return</span>(<span class="kw">t</span>(<span class="kw">t</span>(expr_mat)<span class="op">/</span>norm_factor)) <span class="op">*</span><span class="st"> </span><span class="dv">10</span><span class="op">^</span><span class="dv">6</span></a> -<a class="sourceLine" id="cb297-6" data-line-number="6">}</a></code></pre></div> +<div class="sourceCode" id="cb306"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb306-1" data-line-number="1">calc_cpm <-</a> +<a class="sourceLine" id="cb306-2" data-line-number="2"><span class="cf">function</span> (expr_mat, <span class="dt">spikes =</span> <span class="ot">NULL</span>) </a> +<a class="sourceLine" id="cb306-3" data-line-number="3">{</a> +<a class="sourceLine" id="cb306-4" data-line-number="4"> norm_factor <-<span class="st"> </span><span class="kw">colSums</span>(expr_mat[<span class="op">-</span>spikes, ])</a> +<a class="sourceLine" id="cb306-5" data-line-number="5"> <span class="kw">return</span>(<span class="kw">t</span>(<span class="kw">t</span>(expr_mat)<span class="op">/</span>norm_factor)) <span class="op">*</span><span class="st"> </span><span class="dv">10</span><span class="op">^</span><span class="dv">6</span></a> +<a class="sourceLine" 
id="cb306-6" data-line-number="6">}</a></code></pre></div> <p>One potential drawback of <strong>CPM</strong> is if your sample contains genes that are both very highly expressed and differentially expressed across the cells. In this case, the total molecules in the cell may depend of whether such genes are @@ -644,17 +644,17 @@ scRNASeq experiments. <code>edgeR</code> & <code>scater</code> call this met log expression†(to distinguish it from the many other size-factor normalization methods that now exist). Example of a <strong>SF</strong> function in <code>R</code> (from the <code>edgeR</code> package):</p> -<div class="sourceCode" id="cb298"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb298-1" data-line-number="1">calc_sf <-</a> -<a class="sourceLine" id="cb298-2" data-line-number="2"><span class="cf">function</span> (expr_mat, <span class="dt">spikes =</span> <span class="ot">NULL</span>) </a> -<a class="sourceLine" id="cb298-3" data-line-number="3">{</a> -<a class="sourceLine" id="cb298-4" data-line-number="4"> geomeans <-<span class="st"> </span><span class="kw">exp</span>(<span class="kw">rowMeans</span>(<span class="kw">log</span>(expr_mat[<span class="op">-</span>spikes, ])))</a> -<a class="sourceLine" id="cb298-5" data-line-number="5"> SF <-<span class="st"> </span><span class="cf">function</span>(cnts) {</a> -<a class="sourceLine" id="cb298-6" data-line-number="6"> <span class="kw">median</span>((cnts<span class="op">/</span>geomeans)[(<span class="kw">is.finite</span>(geomeans) <span class="op">&</span><span class="st"> </span>geomeans <span class="op">></span><span class="st"> </span></a> -<a class="sourceLine" id="cb298-7" data-line-number="7"><span class="st"> </span><span class="dv">0</span>)])</a> -<a class="sourceLine" id="cb298-8" data-line-number="8"> }</a> -<a class="sourceLine" id="cb298-9" data-line-number="9"> norm_factor <-<span class="st"> </span><span class="kw">apply</span>(expr_mat[<span 
class="op">-</span>spikes, ], <span class="dv">2</span>, SF)</a> -<a class="sourceLine" id="cb298-10" data-line-number="10"> <span class="kw">return</span>(<span class="kw">t</span>(<span class="kw">t</span>(expr_mat)<span class="op">/</span>norm_factor))</a> -<a class="sourceLine" id="cb298-11" data-line-number="11">}</a></code></pre></div> +<div class="sourceCode" id="cb307"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb307-1" data-line-number="1">calc_sf <-</a> +<a class="sourceLine" id="cb307-2" data-line-number="2"><span class="cf">function</span> (expr_mat, <span class="dt">spikes =</span> <span class="ot">NULL</span>) </a> +<a class="sourceLine" id="cb307-3" data-line-number="3">{</a> +<a class="sourceLine" id="cb307-4" data-line-number="4"> geomeans <-<span class="st"> </span><span class="kw">exp</span>(<span class="kw">rowMeans</span>(<span class="kw">log</span>(expr_mat[<span class="op">-</span>spikes, ])))</a> +<a class="sourceLine" id="cb307-5" data-line-number="5"> SF <-<span class="st"> </span><span class="cf">function</span>(cnts) {</a> +<a class="sourceLine" id="cb307-6" data-line-number="6"> <span class="kw">median</span>((cnts<span class="op">/</span>geomeans)[(<span class="kw">is.finite</span>(geomeans) <span class="op">&</span><span class="st"> </span>geomeans <span class="op">></span><span class="st"> </span></a> +<a class="sourceLine" id="cb307-7" data-line-number="7"><span class="st"> </span><span class="dv">0</span>)])</a> +<a class="sourceLine" id="cb307-8" data-line-number="8"> }</a> +<a class="sourceLine" id="cb307-9" data-line-number="9"> norm_factor <-<span class="st"> </span><span class="kw">apply</span>(expr_mat[<span class="op">-</span>spikes, ], <span class="dv">2</span>, SF)</a> +<a class="sourceLine" id="cb307-10" data-line-number="10"> <span class="kw">return</span>(<span class="kw">t</span>(<span class="kw">t</span>(expr_mat)<span class="op">/</span>norm_factor))</a> +<a class="sourceLine" 
id="cb307-11" data-line-number="11">}</a></code></pre></div> </div> <div id="uq" class="section level4"> <h4><span class="header-section-number">7.1.3.3</span> UQ</h4> @@ -667,16 +667,16 @@ in the 75% quantile being zero (or close to it). This limitation can be overcome by generalizing the idea and using a higher quantile (eg. the 99% quantile is the default in scater) or by excluding zeros prior to calculating the 75% quantile. Example of a <strong>UQ</strong> function in <code>R</code> (again from the <code>edgeR</code> package):</p> -<div class="sourceCode" id="cb299"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb299-1" data-line-number="1">calc_uq <-</a> -<a class="sourceLine" id="cb299-2" data-line-number="2"><span class="cf">function</span> (expr_mat, <span class="dt">spikes =</span> <span class="ot">NULL</span>) </a> -<a class="sourceLine" id="cb299-3" data-line-number="3">{</a> -<a class="sourceLine" id="cb299-4" data-line-number="4"> UQ <-<span class="st"> </span><span class="cf">function</span>(x) {</a> -<a class="sourceLine" id="cb299-5" data-line-number="5"> <span class="kw">quantile</span>(x[x <span class="op">></span><span class="st"> </span><span class="dv">0</span>], <span class="fl">0.75</span>)</a> -<a class="sourceLine" id="cb299-6" data-line-number="6"> }</a> -<a class="sourceLine" id="cb299-7" data-line-number="7"> uq <-<span class="st"> </span><span class="kw">unlist</span>(<span class="kw">apply</span>(expr_mat[<span class="op">-</span>spikes, ], <span class="dv">2</span>, UQ))</a> -<a class="sourceLine" id="cb299-8" data-line-number="8"> norm_factor <-<span class="st"> </span>uq<span class="op">/</span><span class="kw">median</span>(uq)</a> -<a class="sourceLine" id="cb299-9" data-line-number="9"> <span class="kw">return</span>(<span class="kw">t</span>(<span class="kw">t</span>(expr_mat)<span class="op">/</span>norm_factor))</a> -<a class="sourceLine" id="cb299-10" 
data-line-number="10">}</a></code></pre></div> +<div class="sourceCode" id="cb308"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb308-1" data-line-number="1">calc_uq <-</a> +<a class="sourceLine" id="cb308-2" data-line-number="2"><span class="cf">function</span> (expr_mat, <span class="dt">spikes =</span> <span class="ot">NULL</span>) </a> +<a class="sourceLine" id="cb308-3" data-line-number="3">{</a> +<a class="sourceLine" id="cb308-4" data-line-number="4"> UQ <-<span class="st"> </span><span class="cf">function</span>(x) {</a> +<a class="sourceLine" id="cb308-5" data-line-number="5"> <span class="kw">quantile</span>(x[x <span class="op">></span><span class="st"> </span><span class="dv">0</span>], <span class="fl">0.75</span>)</a> +<a class="sourceLine" id="cb308-6" data-line-number="6"> }</a> +<a class="sourceLine" id="cb308-7" data-line-number="7"> uq <-<span class="st"> </span><span class="kw">unlist</span>(<span class="kw">apply</span>(expr_mat[<span class="op">-</span>spikes, ], <span class="dv">2</span>, UQ))</a> +<a class="sourceLine" id="cb308-8" data-line-number="8"> norm_factor <-<span class="st"> </span>uq<span class="op">/</span><span class="kw">median</span>(uq)</a> +<a class="sourceLine" id="cb308-9" data-line-number="9"> <span class="kw">return</span>(<span class="kw">t</span>(<span class="kw">t</span>(expr_mat)<span class="op">/</span>norm_factor))</a> +<a class="sourceLine" id="cb308-10" data-line-number="10">}</a></code></pre></div> </div> <div id="tmm" class="section level4"> <h4><span class="header-section-number">7.1.3.4</span> TMM</h4> @@ -690,7 +690,7 @@ account for the effect of the log scale on variance. Each non-reference cell is multiplied by the calculated factor. 
Two potential issues with this method are insufficient non-zero genes left after trimming, and the assumption that most genes are not differentially expressed.</p> -<div class="sourceCode" id="cb300"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb300-1" data-line-number="1"><span class="kw">sizeFactors</span>(umi.qc) <-<span class="st"> </span>edgeR<span class="op">::</span><span class="kw">calcNormFactors</span>(<span class="kw">counts</span>(umi.qc), <span class="dt">method =</span> <span class="st">"TMM"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb309"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb309-1" data-line-number="1"><span class="kw">sizeFactors</span>(umi.qc) <-<span class="st"> </span>edgeR<span class="op">::</span><span class="kw">calcNormFactors</span>(<span class="kw">counts</span>(umi.qc), <span class="dt">method =</span> <span class="st">"TMM"</span>)</a></code></pre></div> </div> <div id="scran" class="section level4"> <h4><span class="header-section-number">7.1.3.5</span> scran</h4> @@ -703,8 +703,8 @@ factors can be deconvoluted from the collection of pool-specific factors using linear algebra.</p> <p>This method applies a “quick cluster†method to get rough clusters of cells to pool together to apply the strategy outlined above.</p> -<div class="sourceCode" id="cb301"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb301-1" data-line-number="1">qclust <-<span class="st"> </span><span class="kw">quickCluster</span>(umi.qc, <span class="dt">min.size =</span> <span class="dv">30</span>)</a> -<a class="sourceLine" id="cb301-2" data-line-number="2">umi.qc <-<span class="st"> </span><span class="kw">computeSumFactors</span>(umi.qc, <span class="dt">sizes =</span> <span class="dv">15</span>, <span class="dt">clusters =</span> qclust)</a></code></pre></div> +<div class="sourceCode" id="cb310"><pre class="sourceCode r"><code 
class="sourceCode r"><a class="sourceLine" id="cb310-1" data-line-number="1">qclust <-<span class="st"> </span><span class="kw">quickCluster</span>(umi.qc, <span class="dt">min.size =</span> <span class="dv">30</span>)</a> +<a class="sourceLine" id="cb310-2" data-line-number="2">umi.qc <-<span class="st"> </span><span class="kw">computeSumFactors</span>(umi.qc, <span class="dt">sizes =</span> <span class="dv">15</span>, <span class="dt">clusters =</span> qclust)</a></code></pre></div> </div> </div> <div id="sctransform" class="section level3"> @@ -787,26 +787,26 @@ via <code>scater</code>’s <code>plotRLE()</code> function. Namely, cells with higher (lower) than median expression for most genes resulting in a positive (negative) <em>RLE</em> across the cell, whereas normalized cells have an <em>RLE</em> close to zero. Example of a <em>RLE</em> function in <code>R</code>:</p> -<div class="sourceCode" id="cb302"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb302-1" data-line-number="1">calc_cell_RLE <-</a> -<a class="sourceLine" id="cb302-2" data-line-number="2"><span class="cf">function</span> (expr_mat, <span class="dt">spikes =</span> <span class="ot">NULL</span>) </a> -<a class="sourceLine" id="cb302-3" data-line-number="3">{</a> -<a class="sourceLine" id="cb302-4" data-line-number="4"> RLE_gene <-<span class="st"> </span><span class="cf">function</span>(x) {</a> -<a class="sourceLine" id="cb302-5" data-line-number="5"> <span class="cf">if</span> (<span class="kw">median</span>(<span class="kw">unlist</span>(x)) <span class="op">></span><span class="st"> </span><span class="dv">0</span>) {</a> -<a class="sourceLine" id="cb302-6" data-line-number="6"> <span class="kw">log</span>((x <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)<span class="op">/</span>(<span class="kw">median</span>(<span class="kw">unlist</span>(x)) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>))<span 
class="op">/</span><span class="kw">log</span>(<span class="dv">2</span>)</a> -<a class="sourceLine" id="cb302-7" data-line-number="7"> }</a> -<a class="sourceLine" id="cb302-8" data-line-number="8"> <span class="cf">else</span> {</a> -<a class="sourceLine" id="cb302-9" data-line-number="9"> <span class="kw">rep</span>(<span class="ot">NA</span>, <span class="dt">times =</span> <span class="kw">length</span>(x))</a> -<a class="sourceLine" id="cb302-10" data-line-number="10"> }</a> -<a class="sourceLine" id="cb302-11" data-line-number="11"> }</a> -<a class="sourceLine" id="cb302-12" data-line-number="12"> <span class="cf">if</span> (<span class="op">!</span><span class="kw">is.null</span>(spikes)) {</a> -<a class="sourceLine" id="cb302-13" data-line-number="13"> RLE_matrix <-<span class="st"> </span><span class="kw">t</span>(<span class="kw">apply</span>(expr_mat[<span class="op">-</span>spikes, ], <span class="dv">1</span>, RLE_gene))</a> -<a class="sourceLine" id="cb302-14" data-line-number="14"> }</a> -<a class="sourceLine" id="cb302-15" data-line-number="15"> <span class="cf">else</span> {</a> -<a class="sourceLine" id="cb302-16" data-line-number="16"> RLE_matrix <-<span class="st"> </span><span class="kw">t</span>(<span class="kw">apply</span>(expr_mat, <span class="dv">1</span>, RLE_gene))</a> -<a class="sourceLine" id="cb302-17" data-line-number="17"> }</a> -<a class="sourceLine" id="cb302-18" data-line-number="18"> cell_RLE <-<span class="st"> </span><span class="kw">apply</span>(RLE_matrix, <span class="dv">2</span>, median, <span class="dt">na.rm =</span> T)</a> -<a class="sourceLine" id="cb302-19" data-line-number="19"> <span class="kw">return</span>(cell_RLE)</a> -<a class="sourceLine" id="cb302-20" data-line-number="20">}</a></code></pre></div> +<div class="sourceCode" id="cb311"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb311-1" data-line-number="1">calc_cell_RLE <-</a> +<a class="sourceLine" id="cb311-2" 
data-line-number="2"><span class="cf">function</span> (expr_mat, <span class="dt">spikes =</span> <span class="ot">NULL</span>) </a> +<a class="sourceLine" id="cb311-3" data-line-number="3">{</a> +<a class="sourceLine" id="cb311-4" data-line-number="4"> RLE_gene <-<span class="st"> </span><span class="cf">function</span>(x) {</a> +<a class="sourceLine" id="cb311-5" data-line-number="5"> <span class="cf">if</span> (<span class="kw">median</span>(<span class="kw">unlist</span>(x)) <span class="op">></span><span class="st"> </span><span class="dv">0</span>) {</a> +<a class="sourceLine" id="cb311-6" data-line-number="6"> <span class="kw">log</span>((x <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)<span class="op">/</span>(<span class="kw">median</span>(<span class="kw">unlist</span>(x)) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>))<span class="op">/</span><span class="kw">log</span>(<span class="dv">2</span>)</a> +<a class="sourceLine" id="cb311-7" data-line-number="7"> }</a> +<a class="sourceLine" id="cb311-8" data-line-number="8"> <span class="cf">else</span> {</a> +<a class="sourceLine" id="cb311-9" data-line-number="9"> <span class="kw">rep</span>(<span class="ot">NA</span>, <span class="dt">times =</span> <span class="kw">length</span>(x))</a> +<a class="sourceLine" id="cb311-10" data-line-number="10"> }</a> +<a class="sourceLine" id="cb311-11" data-line-number="11"> }</a> +<a class="sourceLine" id="cb311-12" data-line-number="12"> <span class="cf">if</span> (<span class="op">!</span><span class="kw">is.null</span>(spikes)) {</a> +<a class="sourceLine" id="cb311-13" data-line-number="13"> RLE_matrix <-<span class="st"> </span><span class="kw">t</span>(<span class="kw">apply</span>(expr_mat[<span class="op">-</span>spikes, ], <span class="dv">1</span>, RLE_gene))</a> +<a class="sourceLine" id="cb311-14" data-line-number="14"> }</a> +<a class="sourceLine" id="cb311-15" data-line-number="15"> <span 
class="cf">else</span> {</a> +<a class="sourceLine" id="cb311-16" data-line-number="16"> RLE_matrix <-<span class="st"> </span><span class="kw">t</span>(<span class="kw">apply</span>(expr_mat, <span class="dv">1</span>, RLE_gene))</a> +<a class="sourceLine" id="cb311-17" data-line-number="17"> }</a> +<a class="sourceLine" id="cb311-18" data-line-number="18"> cell_RLE <-<span class="st"> </span><span class="kw">apply</span>(RLE_matrix, <span class="dv">2</span>, median, <span class="dt">na.rm =</span> T)</a> +<a class="sourceLine" id="cb311-19" data-line-number="19"> <span class="kw">return</span>(cell_RLE)</a> +<a class="sourceLine" id="cb311-20" data-line-number="20">}</a></code></pre></div> <p><strong>Note</strong> The <strong>RLE</strong>, <strong>TMM</strong>, and <strong>UQ</strong> size-factor methods were developed for bulk RNA-seq data and, depending on the experimental context, may not be appropriate for single-cell RNA-seq data, as their underlying assumptions may be @@ -835,26 +835,26 @@ of the <code>SCE</code> object.</p> <h2><span class="header-section-number">7.2</span> Normalization practice (UMI)</h2> <p>We will continue to work with the <code>tung</code> data that was used in the previous chapter.</p> -<div class="sourceCode" id="cb303"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb303-1" data-line-number="1"><span class="kw">library</span>(scRNA.seq.funcs)</a> -<a class="sourceLine" id="cb303-2" data-line-number="2"><span class="kw">library</span>(scater)</a> -<a class="sourceLine" id="cb303-3" data-line-number="3"><span class="kw">library</span>(scran)</a> -<a class="sourceLine" id="cb303-4" data-line-number="4"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb303-5" data-line-number="5"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a> -<a class="sourceLine" id="cb303-6" data-line-number="6">umi <-<span 
class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/tung/umi.rds"</span>)</a> -<a class="sourceLine" id="cb303-7" data-line-number="7">umi.qc <-<span class="st"> </span>umi[<span class="kw">rowData</span>(umi)<span class="op">$</span>use, <span class="kw">colData</span>(umi)<span class="op">$</span>use]</a> -<a class="sourceLine" id="cb303-8" data-line-number="8">endog_genes <-<span class="st"> </span><span class="op">!</span><span class="kw">rowData</span>(umi.qc)<span class="op">$</span>is_feature_control</a></code></pre></div> +<div class="sourceCode" id="cb312"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb312-1" data-line-number="1"><span class="kw">library</span>(scRNA.seq.funcs)</a> +<a class="sourceLine" id="cb312-2" data-line-number="2"><span class="kw">library</span>(scater)</a> +<a class="sourceLine" id="cb312-3" data-line-number="3"><span class="kw">library</span>(scran)</a> +<a class="sourceLine" id="cb312-4" data-line-number="4"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb312-5" data-line-number="5"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a> +<a class="sourceLine" id="cb312-6" data-line-number="6">umi <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/tung/umi.rds"</span>)</a> +<a class="sourceLine" id="cb312-7" data-line-number="7">umi.qc <-<span class="st"> </span>umi[<span class="kw">rowData</span>(umi)<span class="op">$</span>use, <span class="kw">colData</span>(umi)<span class="op">$</span>use]</a> +<a class="sourceLine" id="cb312-8" data-line-number="8">endog_genes <-<span class="st"> </span><span class="op">!</span><span class="kw">rowData</span>(umi.qc)<span class="op">$</span>is_feature_control</a></code></pre></div> <div id="raw" class="section level3"> <h3><span class="header-section-number">7.2.1</span> Raw</h3> -<div 
class="sourceCode" id="cb304"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb304-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> -<a class="sourceLine" id="cb304-2" data-line-number="2"> umi.qc[endog_genes, ],</a> -<a class="sourceLine" id="cb304-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span></a> -<a class="sourceLine" id="cb304-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb304-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb304-6" data-line-number="6"> tmp,</a> -<a class="sourceLine" id="cb304-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb304-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb304-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb304-10" data-line-number="10">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"PCA plot: raw log-counts"</span>) </a></code></pre></div> +<div class="sourceCode" id="cb313"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb313-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> +<a class="sourceLine" id="cb313-2" data-line-number="2"> umi.qc[endog_genes, ],</a> +<a class="sourceLine" id="cb313-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span></a> +<a class="sourceLine" id="cb313-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb313-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb313-6" data-line-number="6"> tmp,</a> +<a class="sourceLine" id="cb313-7" 
data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb313-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb313-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb313-10" data-line-number="10">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"PCA plot: raw log-counts"</span>) </a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:norm-pca-raw"></span> <img src="exprs-norm_files/figure-html/norm-pca-raw-1.png" alt="PCA plot of the tung data" width="90%" /> <p class="caption"> @@ -864,35 +864,35 @@ Figure 7.1: PCA plot of the tung data </div> <div id="cpm-1" class="section level3"> <h3><span class="header-section-number">7.2.2</span> CPM</h3> -<div class="sourceCode" id="cb305"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb305-1" data-line-number="1"><span class="kw">logcounts</span>(umi.qc) <-<span class="st"> </span><span class="kw">log2</span>(<span class="kw">calculateCPM</span>(umi.qc, <span class="dt">use_size_factors =</span> <span class="ot">FALSE</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)</a> -<a class="sourceLine" id="cb305-2" data-line-number="2"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb305-3" data-line-number="3"> umi.qc[endog_genes, ],</a> -<a class="sourceLine" id="cb305-4" data-line-number="4"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb305-5" data-line-number="5"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb305-6" data-line-number="6"> <span class="dt">shape_by =</span> <span 
class="st">"individual"</span></a> -<a class="sourceLine" id="cb305-7" data-line-number="7">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"PCA plot: log2(CPM) values"</span>) </a></code></pre></div> +<div class="sourceCode" id="cb314"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb314-1" data-line-number="1"><span class="kw">logcounts</span>(umi.qc) <-<span class="st"> </span><span class="kw">log2</span>(<span class="kw">calculateCPM</span>(umi.qc, <span class="dt">use_size_factors =</span> <span class="ot">FALSE</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)</a> +<a class="sourceLine" id="cb314-2" data-line-number="2"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb314-3" data-line-number="3"> umi.qc[endog_genes, ],</a> +<a class="sourceLine" id="cb314-4" data-line-number="4"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb314-5" data-line-number="5"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb314-6" data-line-number="6"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb314-7" data-line-number="7">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"PCA plot: log2(CPM) values"</span>) </a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:norm-pca-cpm"></span> <img src="exprs-norm_files/figure-html/norm-pca-cpm-1.png" alt="PCA plot of the tung data after CPM normalisation" width="90%" /> <p class="caption"> Figure 7.2: PCA plot of the tung data after CPM normalisation </p> </div> -<div class="sourceCode" id="cb306"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb306-1" data-line-number="1"><span class="kw">plotRLE</span>(</a> 
-<a class="sourceLine" id="cb306-2" data-line-number="2"> umi.qc[endog_genes, ], </a> -<a class="sourceLine" id="cb306-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span>,</a> -<a class="sourceLine" id="cb306-4" data-line-number="4"> <span class="dt">colour_by =</span> <span class="st">"batch"</span></a> -<a class="sourceLine" id="cb306-5" data-line-number="5">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"RLE plot: raw log-counts"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb315"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb315-1" data-line-number="1"><span class="kw">plotRLE</span>(</a> +<a class="sourceLine" id="cb315-2" data-line-number="2"> umi.qc[endog_genes, ], </a> +<a class="sourceLine" id="cb315-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span>,</a> +<a class="sourceLine" id="cb315-4" data-line-number="4"> <span class="dt">colour_by =</span> <span class="st">"batch"</span></a> +<a class="sourceLine" id="cb315-5" data-line-number="5">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"RLE plot: raw log-counts"</span>)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-cpm1"></span> <img src="exprs-norm_files/figure-html/norm-ours-rle-cpm-1.png" alt="Cell-wise RLE of the tung data. The relative log expression profile of each cell is represented by a boxplot, which appears as a line here. The grey bar in the middle for each cell represents the interquartile range of the RLE values; the coloured lines represent the whiskers of a boxplot and extend above and below the grey bar by 1.5 times the interquartile range. The median RLE value is shown with a circle." width="90%" /> <p class="caption"> Figure 7.3: Cell-wise RLE of the tung data. 
The relative log expression profile of each cell is represented by a boxplot, which appears as a line here. The grey bar in the middle for each cell represents the interquartile range of the RLE values; the coloured lines represent the whiskers of a boxplot and extend above and below the grey bar by 1.5 times the interquartile range. The median RLE value is shown with a circle. </p> </div> -<div class="sourceCode" id="cb307"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb307-1" data-line-number="1"><span class="kw">plotRLE</span>(</a> -<a class="sourceLine" id="cb307-2" data-line-number="2"> umi.qc[endog_genes, ], </a> -<a class="sourceLine" id="cb307-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts"</span>,</a> -<a class="sourceLine" id="cb307-4" data-line-number="4"> <span class="dt">colour_by =</span> <span class="st">"batch"</span></a> -<a class="sourceLine" id="cb307-5" data-line-number="5">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"RLE plot: log2(CPM)"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb316"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb316-1" data-line-number="1"><span class="kw">plotRLE</span>(</a> +<a class="sourceLine" id="cb316-2" data-line-number="2"> umi.qc[endog_genes, ], </a> +<a class="sourceLine" id="cb316-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts"</span>,</a> +<a class="sourceLine" id="cb316-4" data-line-number="4"> <span class="dt">colour_by =</span> <span class="st">"batch"</span></a> +<a class="sourceLine" id="cb316-5" data-line-number="5">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"RLE plot: log2(CPM)"</span>)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-cpm2"></span> <img 
src="exprs-norm_files/figure-html/norm-ours-rle-cpm-2.png" alt="Cell-wise RLE of the tung data. The relative log expression profile of each cell is represented by a boxplot, which appears as a line here. The grey bar in the middle for each cell represents the interquartile range of the RLE values; the coloured lines represent the whiskers of a boxplot and extend above and below the grey bar by 1.5 times the interquartile range. The median RLE value is shown with a circle." width="90%" /> <p class="caption"> @@ -911,30 +911,26 @@ here as representative of size-factor normalization more generally.</p> <code>scater</code> applies those size factors along with the library sizes to the count matrix to produce normalized log2-counts-per-million values that are then stored in the <code>logcounts</code> slot of the <code>SingleCellExperiment</code> object.</p> -<div class="sourceCode" id="cb308"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb308-1" data-line-number="1">qclust <-<span class="st"> </span><span class="kw">quickCluster</span>(umi.qc, <span class="dt">min.size =</span> <span class="dv">30</span>, <span class="dt">use.ranks =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb308-2" data-line-number="2">umi.qc <-<span class="st"> </span><span class="kw">computeSumFactors</span>(umi.qc, <span class="dt">sizes =</span> <span class="dv">15</span>, <span class="dt">clusters =</span> qclust)</a> -<a class="sourceLine" id="cb308-3" data-line-number="3">umi.qc <-<span class="st"> </span><span class="kw">normalize</span>(umi.qc)</a></code></pre></div> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its -## own size factors</code></pre> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own -## size factors</code></pre> -<div class="sourceCode" id="cb311"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb311-1" 
data-line-number="1"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb311-2" data-line-number="2"> umi.qc[endog_genes, ],</a> -<a class="sourceLine" id="cb311-3" data-line-number="3"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb311-4" data-line-number="4"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb311-5" data-line-number="5"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb311-6" data-line-number="6">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"PCA plot: scran size-factor normalization"</span>) </a></code></pre></div> +<div class="sourceCode" id="cb317"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb317-1" data-line-number="1">qclust <-<span class="st"> </span><span class="kw">quickCluster</span>(umi.qc, <span class="dt">min.size =</span> <span class="dv">30</span>, <span class="dt">use.ranks =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb317-2" data-line-number="2">umi.qc <-<span class="st"> </span><span class="kw">computeSumFactors</span>(umi.qc, <span class="dt">sizes =</span> <span class="dv">15</span>, <span class="dt">clusters =</span> qclust)</a> +<a class="sourceLine" id="cb317-3" data-line-number="3">umi.qc <-<span class="st"> </span><span class="kw">normalize</span>(umi.qc)</a> +<a class="sourceLine" id="cb317-4" data-line-number="4"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb317-5" data-line-number="5"> umi.qc[endog_genes, ],</a> +<a class="sourceLine" id="cb317-6" data-line-number="6"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb317-7" data-line-number="7"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a 
class="sourceLine" id="cb317-8" data-line-number="8"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb317-9" data-line-number="9">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"PCA plot: scran size-factor normalization"</span>) </a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:norm-pca-lsf"></span> <img src="exprs-norm_files/figure-html/norm-pca-lsf-1.png" alt="PCA plot of the tung data after LSF normalisation" width="90%" /> <p class="caption"> Figure 7.5: PCA plot of the tung data after LSF normalisation </p> </div> -<div class="sourceCode" id="cb312"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb312-1" data-line-number="1"><span class="kw">plotRLE</span>(</a> -<a class="sourceLine" id="cb312-2" data-line-number="2"> umi.qc[endog_genes, ], </a> -<a class="sourceLine" id="cb312-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts"</span>,</a> -<a class="sourceLine" id="cb312-4" data-line-number="4"> <span class="dt">colour_by =</span> <span class="st">"batch"</span></a> -<a class="sourceLine" id="cb312-5" data-line-number="5">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"RLE plot: scran size-factor normalization"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb318"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb318-1" data-line-number="1"><span class="kw">plotRLE</span>(</a> +<a class="sourceLine" id="cb318-2" data-line-number="2"> umi.qc[endog_genes, ], </a> +<a class="sourceLine" id="cb318-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts"</span>,</a> +<a class="sourceLine" id="cb318-4" data-line-number="4"> <span class="dt">colour_by =</span> <span class="st">"batch"</span></a> +<a class="sourceLine" 
id="cb318-5" data-line-number="5">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"RLE plot: scran size-factor normalization"</span>)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-scran"></span> <img src="exprs-norm_files/figure-html/norm-ours-rle-scran-1.png" alt="Cell-wise RLE of the tung data" width="90%" /> <p class="caption"> @@ -944,7 +940,7 @@ Figure 7.6: Cell-wise RLE of the tung data <p><code>scran</code> sometimes calculates negative or zero size factors. These will completely distort the normalized expression matrix. We can check the size factors scran has computed like so:</p> -<div class="sourceCode" id="cb313"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb313-1" data-line-number="1"><span class="kw">summary</span>(<span class="kw">sizeFactors</span>(umi.qc))</a></code></pre></div> +<div class="sourceCode" id="cb319"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb319-1" data-line-number="1"><span class="kw">summary</span>(<span class="kw">sizeFactors</span>(umi.qc))</a></code></pre></div> <pre><code>## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.4836 0.7747 0.9532 1.0000 1.1483 3.2873</code></pre> <p>For this dataset all the size factors are reasonable so we are done. If you find @@ -960,116 +956,512 @@ negative binomial generalized linear model was introduced above. 
Here we demonstrate how to apply this method.</p> <p>Note that (due to what looks like a bug in this version of <code>sctransform</code>) we need to convert the UMI count matrix to a sparse format to apply sctransform.</p> -<div class="sourceCode" id="cb315"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb315-1" data-line-number="1">umi_sparse <-<span class="st"> </span><span class="kw">as</span>(<span class="kw">counts</span>(umi.qc), <span class="st">"dgCMatrix"</span>)</a> -<a class="sourceLine" id="cb315-2" data-line-number="2"><span class="co">### Genes expressed in at least 5 cells will be kept</span></a> -<a class="sourceLine" id="cb315-3" data-line-number="3">sctnorm_data <-<span class="st"> </span>sctransform<span class="op">::</span><span class="kw">vst</span>(<span class="dt">umi =</span> umi_sparse, <span class="dt">min_cells =</span> <span class="dv">1</span>,</a> -<a class="sourceLine" id="cb315-4" data-line-number="4"> <span class="dt">cell_attr =</span> <span class="kw">as.data.frame</span>(<span class="kw">colData</span>(umi.qc)),</a> -<a class="sourceLine" id="cb315-5" data-line-number="5"> <span class="dt">latent_var =</span> <span class="st">"log10_total_counts_endogenous"</span>)</a></code></pre></div> -<pre><code>## Calculating cell attributes for input UMI matrix</code></pre> -<pre><code>## Variance stabilizing transformation of count matrix of size 14066 by 657</code></pre> -<pre><code>## Model formula is y ~ log10_total_counts_endogenous</code></pre> -<pre><code>## Get Negative Binomial regression parameters per gene</code></pre> -<pre><code>## Using 2000 genes, 657 cells</code></pre> +<div class="sourceCode" id="cb321"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb321-1" data-line-number="1">umi_sparse <-<span class="st"> </span><span class="kw">as</span>(<span class="kw">counts</span>(umi.qc), <span class="st">"dgCMatrix"</span>)</a> +<a class="sourceLine" id="cb321-2" 
data-line-number="2"><span class="co">### Genes expressed in at least 5 cells will be kept</span></a> +<a class="sourceLine" id="cb321-3" data-line-number="3">sctnorm_data <-<span class="st"> </span>sctransform<span class="op">::</span><span class="kw">vst</span>(<span class="dt">umi =</span> umi_sparse, <span class="dt">min_cells =</span> <span class="dv">1</span>,</a> +<a class="sourceLine" id="cb321-4" data-line-number="4"> <span class="dt">cell_attr =</span> <span class="kw">as.data.frame</span>(<span class="kw">colData</span>(umi.qc)),</a> +<a class="sourceLine" id="cb321-5" data-line-number="5"> <span class="dt">latent_var =</span> <span class="st">"log10_total_counts_endogenous"</span>)</a></code></pre></div> <pre><code>## | - | | 0%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + | | 0% + | + |======== | 12% + | + |================ | 25% + | + |======================== | 38% + | + |================================ | 50% + | + |========================================= | 62% + | + |================================================= | 75% + | + |========================================================= | 88% + | + |=================================================================| 100% +## + | + | | 0% + | + |= | 2% + | + |== | 4% + | + |==== | 5% + | + |===== | 7% + | + |====== | 9% + | + |======= | 11% + | + |======== | 13% + | + |========= | 15% + | + |=========== | 16% + | + |============ | 18% + | + |============= | 20% + | + |============== | 22% + | + |=============== | 24% + | + |================= | 25% + | + |================== | 27% 
+ | + |=================== | 29% + | + |==================== | 31% + | + |===================== | 33% + | + |====================== | 35% + | + |======================== | 36% + | + |========================= | 38% + | + |========================== | 40% + | + |=========================== | 42% + | + |============================ | 44% + | + |============================== | 45% + | + |=============================== | 47% + | + |================================ | 49% + | + |================================= | 51% + | + |================================== | 53% + | + |=================================== | 55% + | + |===================================== | 56% + | + |====================================== | 58% + | + |======================================= | 60% + | + |======================================== | 62% + | + |========================================= | 64% + | + |=========================================== | 65% + | + |============================================ | 67% + | + |============================================= | 69% + | + |============================================== | 71% + | + |=============================================== | 73% + | + |================================================ | 75% + | + |================================================== | 76% + | + |=================================================== | 78% + | + |==================================================== | 80% + | + |===================================================== | 82% + | + |====================================================== | 84% + | + |======================================================== | 85% + | + |========================================================= | 87% + | + |========================================================== | 89% + | + |=========================================================== | 91% + | + |============================================================ | 93% + | + |============================================================= | 
95% + | + |=============================================================== | 96% + | + |================================================================ | 98% + | + |=================================================================| 100%</code></pre> +<div class="sourceCode" id="cb323"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb323-1" data-line-number="1"><span class="co">## Pearson residuals, or deviance residuals</span></a> +<a class="sourceLine" id="cb323-2" data-line-number="2"><span class="kw">dim</span>(sctnorm_data<span class="op">$</span>y)</a></code></pre></div> +<pre><code>## [1] 14066 657</code></pre> +<div class="sourceCode" id="cb325"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb325-1" data-line-number="1"><span class="kw">dim</span>(umi.qc)</a></code></pre></div> +<pre><code>## [1] 14066 657</code></pre> +<div class="sourceCode" id="cb327"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb327-1" data-line-number="1">sctnorm_data<span class="op">$</span>model_str</a></code></pre></div> +<pre><code>## [1] "y ~ log10_total_counts_endogenous"</code></pre> +<div class="sourceCode" id="cb329"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb329-1" data-line-number="1"><span class="kw">assay</span>(umi.qc, <span class="st">"sctrans_norm"</span>) <-<span class="st"> </span>sctnorm_data<span class="op">$</span>y</a></code></pre></div> +<p>Let us look at the NB GLM model parameters estimated by sctransform.</p> +<div class="sourceCode" id="cb330"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb330-1" data-line-number="1"><span class="co">#sce$log10_total_counts</span></a> +<a class="sourceLine" id="cb330-2" data-line-number="2"><span class="co">## Matrix of estimated model parameters per gene (theta and regression coefficients)</span></a> +<a class="sourceLine" id="cb330-3" 
data-line-number="3">sctransform<span class="op">::</span><span class="kw">plot_model_pars</span>(sctnorm_data)</a></code></pre></div> +<p><img src="exprs-norm_files/figure-html/sctransform-params-plot-1.png" width="90%" style="display: block; margin: auto;" /></p> +<p>We can look at the effect of sctransform’s normalization on three particular +genes, ACTB, POU5F1 (aka OCT4) and CD74.</p> +<div class="sourceCode" id="cb331"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb331-1" data-line-number="1"><span class="co">##c('ACTB', 'Rpl10', 'Cd74')</span></a> +<a class="sourceLine" id="cb331-2" data-line-number="2">genes_plot <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"ENSG00000075624"</span>, <span class="st">"ENSG00000204531"</span>, <span class="st">"ENSG00000019582"</span>)</a> +<a class="sourceLine" id="cb331-3" data-line-number="3">sctransform<span class="op">::</span><span class="kw">plot_model</span>(sctnorm_data, umi_sparse, genes_plot, </a> +<a class="sourceLine" id="cb331-4" data-line-number="4"> <span class="dt">plot_residual =</span> <span class="ot">TRUE</span>, <span class="dt">cell_attr =</span> <span class="kw">as.data.frame</span>(<span class="kw">colData</span>(umi.qc)))</a></code></pre></div> +<p><img src="exprs-norm_files/figure-html/sctransform-genes-plot-1.png" width="90%" style="display: block; margin: auto;" /></p> +<div class="sourceCode" id="cb332"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb332-1" data-line-number="1"><span class="kw">reducedDim</span>(umi.qc, <span class="st">"PCA_sctrans_norm"</span>) <-<span class="st"> </span><span class="kw">reducedDim</span>(</a> +<a class="sourceLine" id="cb332-2" data-line-number="2"> <span class="kw">runPCA</span>(umi.qc[endog_genes, ], <span class="dt">exprs_values =</span> <span class="st">"sctrans_norm"</span>)</a> +<a class="sourceLine" id="cb332-3" data-line-number="3">)</a> +<a class="sourceLine" 
id="cb332-4" data-line-number="4"><span class="kw">plotReducedDim</span>(</a> +<a class="sourceLine" id="cb332-5" data-line-number="5"> umi.qc,</a> +<a class="sourceLine" id="cb332-6" data-line-number="6"> <span class="dt">use_dimred =</span> <span class="st">"PCA_sctrans_norm"</span>,</a> +<a class="sourceLine" id="cb332-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb332-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb332-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb332-10" data-line-number="10">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"PCA plot: sctransform normalization"</span>) </a></code></pre></div> +<div class="figure" style="text-align: center"><span id="fig:norm-pca-sctransform"></span> +<img src="exprs-norm_files/figure-html/norm-pca-sctransform-1.png" alt="PCA plot of the tung data after sctransform normalisation (Pearson residuals)." width="90%" /> +<p class="caption"> +Figure 7.7: PCA plot of the tung data after sctransform normalisation (Pearson residuals). 
+</p> +</div> +<div class="sourceCode" id="cb333"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb333-1" data-line-number="1"><span class="kw">plotRLE</span>(</a> +<a class="sourceLine" id="cb333-2" data-line-number="2"> umi.qc[endog_genes, ], </a> +<a class="sourceLine" id="cb333-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"sctrans_norm"</span>,</a> +<a class="sourceLine" id="cb333-4" data-line-number="4"> <span class="dt">colour_by =</span> <span class="st">"batch"</span></a> +<a class="sourceLine" id="cb333-5" data-line-number="5">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"RLE plot: sctransform normalization"</span>)</a></code></pre></div> +<div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-sctransform"></span> +<img src="exprs-norm_files/figure-html/norm-ours-rle-sctransform-1.png" alt="Cell-wise RLE of the tung data" width="90%" /> +<p class="caption"> +Figure 7.8: Cell-wise RLE of the tung data +</p> +</div> +</div> +<div id="normalisation-for-genetranscript-length" class="section level3"> +<h3><span class="header-section-number">7.2.5</span> Normalisation for gene/transcript length</h3> +<p>Some methods combine library size and fragment/gene length normalization such +as:</p> +<ul> +<li><strong>RPKM</strong> - Reads Per Kilobase Million (for single-end sequencing)</li> +<li><strong>FPKM</strong> - Fragments Per Kilobase Million (same as <strong>RPKM</strong> but for paired-end +sequencing, makes sure that paired ends mapped to the same fragment are not +counted twice)</li> +<li><strong>TPM</strong> - Transcripts Per Kilobase Million (same as <strong>RPKM</strong>, but the order of +normalizations is reversed - length first and sequencing depth second)</li> +</ul> +<p>These methods are not applicable to our dataset since the end +of the transcript which contains the UMI was preferentially +sequenced. 
Furthermore in general these should only be calculated +using appropriate quantification software from aligned BAM files not +from read counts since often only a portion of the entire +gene/transcript is sequenced, not the entire length. If in doubt check +for a relationship between gene/transcript length and expression level.</p> +<p>However, here we show how these normalisations can be calculated using <code>scater</code>. +First, we need to find the effective transcript length in Kilobases. However, +our dataset containes only gene IDs, therefore we will be using the gene lengths +instead of transcripts. <code>scater</code> uses the +<a href="https://bioconductor.org/packages/release/bioc/html/biomaRt.html">biomaRt</a> +package, which allows one to annotate genes by other attributes:</p> +<div class="sourceCode" id="cb334"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb334-1" data-line-number="1">umi.qc <-<span class="st"> </span><span class="kw">getBMFeatureAnnos</span>(</a> +<a class="sourceLine" id="cb334-2" data-line-number="2"> umi.qc,</a> +<a class="sourceLine" id="cb334-3" data-line-number="3"> <span class="dt">filters =</span> <span class="st">"ensembl_gene_id"</span>, </a> +<a class="sourceLine" id="cb334-4" data-line-number="4"> <span class="dt">attributes =</span> <span class="kw">c</span>(</a> +<a class="sourceLine" id="cb334-5" data-line-number="5"> <span class="st">"ensembl_gene_id"</span>,</a> +<a class="sourceLine" id="cb334-6" data-line-number="6"> <span class="st">"hgnc_symbol"</span>,</a> +<a class="sourceLine" id="cb334-7" data-line-number="7"> <span class="st">"chromosome_name"</span>,</a> +<a class="sourceLine" id="cb334-8" data-line-number="8"> <span class="st">"start_position"</span>,</a> +<a class="sourceLine" id="cb334-9" data-line-number="9"> <span class="st">"end_position"</span></a> +<a class="sourceLine" id="cb334-10" data-line-number="10"> ), </a> +<a class="sourceLine" id="cb334-11" 
data-line-number="11"> <span class="dt">biomart =</span> <span class="st">"ENSEMBL_MART_ENSEMBL"</span>, </a> +<a class="sourceLine" id="cb334-12" data-line-number="12"> <span class="dt">dataset =</span> <span class="st">"hsapiens_gene_ensembl"</span>,</a> +<a class="sourceLine" id="cb334-13" data-line-number="13"> <span class="dt">host =</span> <span class="st">"www.ensembl.org"</span></a> +<a class="sourceLine" id="cb334-14" data-line-number="14">)</a> +<a class="sourceLine" id="cb334-15" data-line-number="15"></a> +<a class="sourceLine" id="cb334-16" data-line-number="16"><span class="co"># If you have mouse data, change the arguments based on this example:</span></a> +<a class="sourceLine" id="cb334-17" data-line-number="17"><span class="co"># getBMFeatureAnnos(</span></a> +<a class="sourceLine" id="cb334-18" data-line-number="18"><span class="co"># object,</span></a> +<a class="sourceLine" id="cb334-19" data-line-number="19"><span class="co"># filters = "ensembl_transcript_id",</span></a> +<a class="sourceLine" id="cb334-20" data-line-number="20"><span class="co"># attributes = c(</span></a> +<a class="sourceLine" id="cb334-21" data-line-number="21"><span class="co"># "ensembl_transcript_id",</span></a> +<a class="sourceLine" id="cb334-22" data-line-number="22"><span class="co"># "ensembl_gene_id", </span></a> +<a class="sourceLine" id="cb334-23" data-line-number="23"><span class="co"># "mgi_symbol",</span></a> +<a class="sourceLine" id="cb334-24" data-line-number="24"><span class="co"># "chromosome_name",</span></a> +<a class="sourceLine" id="cb334-25" data-line-number="25"><span class="co"># "transcript_biotype",</span></a> +<a class="sourceLine" id="cb334-26" data-line-number="26"><span class="co"># "transcript_start",</span></a> +<a class="sourceLine" id="cb334-27" data-line-number="27"><span class="co"># "transcript_end",</span></a> +<a class="sourceLine" id="cb334-28" data-line-number="28"><span class="co"># "transcript_count"</span></a> +<a 
class="sourceLine" id="cb334-29" data-line-number="29"><span class="co"># ),</span></a> +<a class="sourceLine" id="cb334-30" data-line-number="30"><span class="co"># biomart = "ENSEMBL_MART_ENSEMBL",</span></a> +<a class="sourceLine" id="cb334-31" data-line-number="31"><span class="co"># dataset = "mmusculus_gene_ensembl",</span></a> +<a class="sourceLine" id="cb334-32" data-line-number="32"><span class="co"># host = "www.ensembl.org"</span></a> +<a class="sourceLine" id="cb334-33" data-line-number="33"><span class="co"># )</span></a></code></pre></div> +<p>Some of the genes were not annotated, therefore we filter them out:</p> +<div class="sourceCode" id="cb335"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb335-1" data-line-number="1">umi.qc.ann <-<span class="st"> </span>umi.qc[<span class="op">!</span><span class="kw">is.na</span>(<span class="kw">rowData</span>(umi.qc)<span class="op">$</span>ensembl_gene_id), ]</a></code></pre></div> +<p>Now we compute the total gene length in Kilobases by using the <code>end_position</code> +and <code>start_position</code> fields:</p> +<div class="sourceCode" id="cb336"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb336-1" data-line-number="1">eff_length <-<span class="st"> </span></a> +<a class="sourceLine" id="cb336-2" data-line-number="2"><span class="st"> </span><span class="kw">abs</span>(<span class="kw">rowData</span>(umi.qc.ann)<span class="op">$</span>end_position <span class="op">-</span><span class="st"> </span><span class="kw">rowData</span>(umi.qc.ann)<span class="op">$</span>start_position) <span class="op">/</span><span class="st"> </span><span class="dv">1000</span></a></code></pre></div> +<div class="sourceCode" id="cb337"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb337-1" data-line-number="1"><span class="kw">plot</span>(eff_length, <span class="kw">rowMeans</span>(<span 
class="kw">counts</span>(umi.qc.ann)))</a></code></pre></div> +<p>There is no relationship between gene length and mean expression so __FPKM__s & +__TPM__s are inappropriate for this dataset. This is what we would expect for +UMI protocols that tag one end of the transcript. But we will demonstrate them +anyway.</p> +<p><strong>Note</strong> Here calculate the total gene length instead of the total exon length. +Many genes will contain lots of introns so their <code>eff_length</code> will be very +different from what we have calculated. Please consider our calculation as +approximation. If you want to use the total exon lengths, please refer to <a href="https://www.biostars.org/p/83901/">this +page</a>.</p> +<p>Now we are ready to perform the normalisations:</p> +<div class="sourceCode" id="cb338"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb338-1" data-line-number="1"><span class="kw">tpm</span>(umi.qc.ann) <-<span class="st"> </span><span class="kw">log2</span>(<span class="kw">calculateTPM</span>(umi.qc.ann, eff_length) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)</a></code></pre></div> +<p>Plot the results as a PCA plot:</p> +<div class="sourceCode" id="cb339"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb339-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> +<a class="sourceLine" id="cb339-2" data-line-number="2"> umi.qc.ann,</a> +<a class="sourceLine" id="cb339-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"tpm"</span>,</a> +<a class="sourceLine" id="cb339-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb339-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb339-6" data-line-number="6"> tmp,</a> +<a class="sourceLine" id="cb339-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a 
class="sourceLine" id="cb339-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb339-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb339-10" data-line-number="10">)</a></code></pre></div> +<div class="sourceCode" id="cb340"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb340-1" data-line-number="1"><span class="kw">tpm</span>(umi.qc.ann) <-<span class="st"> </span><span class="kw">log2</span>(<span class="kw">calculateFPKM</span>(umi.qc.ann, eff_length) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)</a></code></pre></div> +<div class="sourceCode" id="cb341"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb341-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> +<a class="sourceLine" id="cb341-2" data-line-number="2"> umi.qc.ann,</a> +<a class="sourceLine" id="cb341-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"tpm"</span>,</a> +<a class="sourceLine" id="cb341-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb341-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb341-6" data-line-number="6"> tmp,</a> +<a class="sourceLine" id="cb341-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb341-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb341-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb341-10" data-line-number="10">)</a></code></pre></div> +<p><strong>Note</strong> The <code>PCA</code> looks for differences between cells. 
Gene length is the same +across cells for each gene thus <strong>FPKM</strong> is almost identical to the <strong>CPM</strong> plot +(it is just rotated) since it performs <strong>CPM</strong> first then normalizes gene +length. Whereas, <strong>TPM</strong> is different because it weights genes by their length +before performing __CPM_**.</p> +</div> +<div id="reflection" class="section level3"> +<h3><span class="header-section-number">7.2.6</span> Reflection</h3> +<p><strong>Q:</strong> What is your assessment of the performance of these different +normalization methods on the data presented here?</p> +<p><strong>Q:</strong> Which normalization method would you prefer for this dataset? Why?</p> +</div> +<div id="exercise" class="section level3"> +<h3><span class="header-section-number">7.2.7</span> Exercise</h3> +<p>Perform the same analysis with read counts of the <code>tung</code> data. Use +<code>tung/reads.rds</code> file to load the reads <code>SCE</code> object. Once you have finished +please compare your results to ours (next chapter).</p> +</div> +<div id="sessioninfo-2" class="section level3"> +<h3><span class="header-section-number">7.2.8</span> sessionInfo()</h3> +<pre><code>## R version 3.6.0 (2019-04-26) +## Platform: x86_64-pc-linux-gnu (64-bit) +## Running under: Ubuntu 18.04.3 LTS +## +## Matrix products: default +## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 +## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 +## +## locale: +## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C +## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 +## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 +## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C +## [9] LC_ADDRESS=C LC_TELEPHONE=C +## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C +## +## attached base packages: +## [1] parallel stats4 stats graphics grDevices utils datasets +## [8] methods base +## +## other attached packages: +## [1] scran_1.12.1 scater_1.12.2 +## [3] ggplot2_3.2.1 
SingleCellExperiment_1.6.0 +## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 +## [7] BiocParallel_1.18.1 matrixStats_0.55.0 +## [9] Biobase_2.44.0 GenomicRanges_1.36.1 +## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 +## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 +## [15] scRNA.seq.funcs_0.1.0 +## +## loaded via a namespace (and not attached): +## [1] viridis_0.5.1 dynamicTreeCut_1.63-1 +## [3] edgeR_3.26.8 BiocSingular_1.0.0 +## [5] viridisLite_0.3.0 DelayedMatrixStats_1.6.1 +## [7] elliptic_1.4-0 moments_0.14 +## [9] assertthat_0.2.1 statmod_1.4.32 +## [11] highr_0.8 dqrng_0.2.1 +## [13] GenomeInfoDbData_1.2.1 vipor_0.4.5 +## [15] yaml_2.2.0 globals_0.12.4 +## [17] pillar_1.4.2 lattice_0.20-38 +## [19] glue_1.3.1 limma_3.40.6 +## [21] digest_0.6.21 XVector_0.24.0 +## [23] colorspace_1.4-1 plyr_1.8.4 +## [25] cowplot_1.0.0 htmltools_0.3.6 +## [27] Matrix_1.2-17 pkgconfig_2.0.3 +## [29] listenv_0.7.0 bookdown_0.13 +## [31] zlibbioc_1.30.0 purrr_0.3.2 +## [33] scales_1.0.0 Rtsne_0.15 +## [35] tibble_2.1.3 withr_2.1.2 +## [37] lazyeval_0.2.2 magrittr_1.5 +## [39] crayon_1.3.4 evaluate_0.14 +## [41] future_1.14.0 MASS_7.3-51.1 +## [43] beeswarm_0.2.3 tools_3.6.0 +## [45] stringr_1.4.0 locfit_1.5-9.1 +## [47] munsell_0.5.0 irlba_2.3.3 +## [49] orthopolynom_1.0-5 compiler_3.6.0 +## [51] rsvd_1.0.2 contfrac_1.1-12 +## [53] rlang_0.4.0 grid_3.6.0 +## [55] RCurl_1.95-4.12 BiocNeighbors_1.2.0 +## [57] igraph_1.2.4.1 labeling_0.3 +## [59] bitops_1.0-6 rmarkdown_1.15 +## [61] codetools_0.2-16 hypergeo_1.2-13 +## [63] gtable_0.3.0 deSolve_1.24 +## [65] reshape2_1.4.3 R6_2.4.0 +## [67] gridExtra_2.3 knitr_1.25 +## [69] dplyr_0.8.3 future.apply_1.3.0 +## [71] stringi_1.4.3 ggbeeswarm_0.6.0 +## [73] Rcpp_1.0.2 sctransform_0.2.0 +## [75] tidyselect_0.2.5 xfun_0.9</code></pre> -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration 
limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +</div> +</div> +<div id="normalization-practice-reads" class="section level2"> +<h2><span class="header-section-number">7.3</span> Normalization practice (Reads)</h2> +<div 
class="figure" style="text-align: center"><span id="fig:norm-pca-raw-reads"></span> +<img src="exprs-norm-reads_files/figure-html/norm-pca-raw-reads-1.png" alt="PCA plot of the tung data" width="90%" /> +<p class="caption"> +Figure 7.9: PCA plot of the tung data +</p> +</div> +<div class="figure" style="text-align: center"><span id="fig:norm-pca-cpm-reads"></span> +<img src="exprs-norm-reads_files/figure-html/norm-pca-cpm-reads-1.png" alt="PCA plot of the tung data after CPM normalisation" width="90%" /> +<p class="caption"> +Figure 7.10: PCA plot of the tung data after CPM normalisation +</p> +</div> +<div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-cpm-reads1"></span> +<img src="exprs-norm-reads_files/figure-html/norm-ours-rle-cpm-reads-1.png" alt="Cell-wise RLE of the tung data" width="90%" /> +<p class="caption"> +Figure 7.11: Cell-wise RLE of the tung data +</p> +</div> +<div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-cpm-reads2"></span> +<img src="exprs-norm-reads_files/figure-html/norm-ours-rle-cpm-reads-2.png" alt="Cell-wise RLE of the tung data" width="90%" /> +<p class="caption"> +Figure 7.12: Cell-wise RLE of the tung data +</p> +</div> +<pre><code>## Warning: Setting 'use.ranks=TRUE' for the old defaults. 
+## Set 'use.ranks=FALSE' for the new defaults.</code></pre> +<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its +## own size factors</code></pre> +<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own +## size factors</code></pre> +<div class="figure" style="text-align: center"><span id="fig:norm-pca-lsf-umi"></span> +<img src="exprs-norm-reads_files/figure-html/norm-pca-lsf-umi-1.png" alt="PCA plot of the tung data after LSF normalisation" width="90%" /> +<p class="caption"> +Figure 7.13: PCA plot of the tung data after LSF normalisation +</p> +</div> +<div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-scran-reads1"></span> +<img src="exprs-norm-reads_files/figure-html/norm-ours-rle-scran-reads-1.png" alt="Cell-wise RLE of the tung data" width="90%" /> +<p class="caption"> +Figure 7.14: Cell-wise RLE of the tung data +</p> +</div> +<div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-scran-reads2"></span> +<img src="exprs-norm-reads_files/figure-html/norm-ours-rle-scran-reads-2.png" alt="Cell-wise RLE of the tung data" width="90%" /> +<p class="caption"> +Figure 7.15: Cell-wise RLE of the tung data +</p> +</div> +<pre><code>## Calculating cell attributes for input UMI matrix</code></pre> +<pre><code>## Variance stabilizing transformation of count matrix of size 16062 by 606</code></pre> +<pre><code>## Model formula is y ~ log10_total_counts_endogenous</code></pre> +<pre><code>## Get Negative Binomial regression parameters per gene</code></pre> +<pre><code>## Using 2000 genes, 606 cells</code></pre> +<pre><code>## + | + | | 0%</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs 
produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## + | + |======== | 12%</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs 
produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## + | + |================ | 25%</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs 
produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -1077,9 +1469,15 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -1091,11 +1489,25 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## + | + |======================== | 38%</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -1105,15 +1517,37 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## + | + |================================ | 50%</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## + | + |========================================= | 62%</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -1123,453 +1557,915 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu 
= fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## + | + |================================================= | 75%</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## + | + |========================================================= | 88%</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> <pre><code>## | - |======== | 12%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu 
= fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + |=================================================================| 100%</code></pre> +<pre><code>## Found 1 outliers - those will be ignored in fitting/regularization step</code></pre> +<pre><code>## Second step: Get residuals using fitted parameters for 16062 genes</code></pre> +<pre><code>## + | + | | 0% + | + |= | 2% + | + |== | 3% + | + |=== | 5% + | + |==== | 6% + | + |===== | 8% + | + |====== | 10% + | + |======= | 11% + | + |======== | 13% + | + |========= | 14% + | + |========== | 16% + | + |=========== | 17% + | + |============ | 19% + | + |============= | 21% + | + |============== | 22% + | + |=============== | 24% + | + |================= | 25% + | + |================== | 27% + | + |=================== | 29% + | + |==================== | 30% + | + |===================== | 32% + | + |====================== | 33% + | + |======================= | 35% + | + |======================== | 37% + | + |========================= | 38% + | + |========================== | 40% + | + |=========================== | 41% + | + |============================ | 43% + | + |============================= | 44% + | + |============================== | 46% + | + |=============================== | 48% + | + |================================ | 49% + | + |================================= | 51% + | + |================================== | 52% + | + |=================================== | 54% + | + |==================================== | 56% + | + |===================================== | 57% + | + |====================================== | 59% + | + |======================================= | 60% + | + |======================================== | 62% + | + |========================================= | 63% + | + |========================================== | 65% + | + |=========================================== | 67% + 
| + |============================================ | 68% + | + |============================================= | 70% + | + |============================================== | 71% + | + |=============================================== | 73% + | + |================================================ | 75% + | + |================================================== | 76% + | + |=================================================== | 78% + | + |==================================================== | 79% + | + |===================================================== | 81% + | + |====================================================== | 83% + | + |======================================================= | 84% + | + |======================================================== | 86% + | + |========================================================= | 87% + | + |========================================================== | 89% + | + |=========================================================== | 90% + | + |============================================================ | 92% + | + |============================================================= | 94% + | + |============================================================== | 95% + | + |=============================================================== | 97% + | + |================================================================ | 98% + | + |=================================================================| 100%</code></pre> +<pre><code>## Calculating gene attributes</code></pre> +<pre><code>## Wall clock passed: Time difference of 16.15881 secs</code></pre> +<pre><code>## [1] 16062 606</code></pre> +<pre><code>## [1] 16062 606</code></pre> +<pre><code>## [1] "y ~ log10_total_counts_endogenous"</code></pre> +<p>Let us look at the NB GLM model parameters estimated by sctransform.</p> +<p><img src="exprs-norm-reads_files/figure-html/norm-ours-sctransform-params-plot-reads-1.png" width="90%" style="display: block; margin: auto;" /></p> +<p>We can look at the 
effect of sctransform’s normalization on three particular +genes, ACTB, POU5F1 (aka OCT4) and CD74.</p> +<p><img src="exprs-norm-reads_files/figure-html/norm-ours-sctransform-genes-plot-reads-1.png" width="90%" style="display: block; margin: auto;" /></p> +<div class="figure" style="text-align: center"><span id="fig:norm-ours-pca-sctransform-reads"></span> +<img src="exprs-norm-reads_files/figure-html/norm-ours-pca-sctransform-reads-1.png" alt="PCA plot of the tung reads data after sctransform normalisation (Pearson residuals)." width="90%" /> +<p class="caption"> +Figure 7.16: PCA plot of the tung reads data after sctransform normalisation (Pearson residuals). +</p> +</div> +<div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-sctransform-reads"></span> +<img src="exprs-norm-reads_files/figure-html/norm-ours-rle-sctransform-reads-1.png" alt="Cell-wise RLE of the tung reads data" width="90%" /> +<p class="caption"> +Figure 7.17: Cell-wise RLE of the tung reads data +</p> +</div> +<div class="figure" style="text-align: center"><span id="fig:norm-pca-tpm-reads"></span> +<img src="exprs-norm-reads_files/figure-html/norm-pca-tpm-reads-1.png" alt="PCA plot of the tung data after TPM normalisation" width="90%" /> +<p class="caption"> +Figure 7.18: PCA plot of the tung data after TPM normalisation +</p> +</div> +<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its +## own size factors</code></pre> +<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own +## size factors</code></pre> +<pre><code>## R version 3.6.0 (2019-04-26) +## Platform: x86_64-pc-linux-gnu (64-bit) +## Running under: Ubuntu 18.04.3 LTS +## +## Matrix products: default +## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 +## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 +## +## locale: +## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C +## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 +## [5] 
LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 +## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C +## [9] LC_ADDRESS=C LC_TELEPHONE=C +## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C +## +## attached base packages: +## [1] parallel stats4 stats graphics grDevices utils datasets +## [8] methods base +## +## other attached packages: +## [1] scran_1.12.1 scater_1.12.2 +## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 +## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 +## [7] BiocParallel_1.18.1 matrixStats_0.55.0 +## [9] Biobase_2.44.0 GenomicRanges_1.36.1 +## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 +## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 +## [15] scRNA.seq.funcs_0.1.0 +## +## loaded via a namespace (and not attached): +## [1] bitops_1.0-6 bit64_0.9-7 +## [3] httr_1.4.1 progress_1.2.2 +## [5] dynamicTreeCut_1.63-1 backports_1.1.4 +## [7] sctransform_0.2.0 tools_3.6.0 +## [9] R6_2.4.0 irlba_2.3.3 +## [11] hypergeo_1.2-13 vipor_0.4.5 +## [13] DBI_1.0.0 lazyeval_0.2.2 +## [15] colorspace_1.4-1 withr_2.1.2 +## [17] prettyunits_1.0.2 tidyselect_0.2.5 +## [19] gridExtra_2.3 moments_0.14 +## [21] curl_4.2 bit_1.1-14 +## [23] compiler_3.6.0 orthopolynom_1.0-5 +## [25] BiocNeighbors_1.2.0 labeling_0.3 +## [27] bookdown_0.13 scales_1.0.0 +## [29] stringr_1.4.0 digest_0.6.21 +## [31] rmarkdown_1.15 XVector_0.24.0 +## [33] pkgconfig_2.0.3 htmltools_0.3.6 +## [35] limma_3.40.6 highr_0.8 +## [37] rlang_0.4.0 RSQLite_2.1.2 +## [39] DelayedMatrixStats_1.6.1 dplyr_0.8.3 +## [41] RCurl_1.95-4.12 magrittr_1.5 +## [43] BiocSingular_1.0.0 GenomeInfoDbData_1.2.1 +## [45] Matrix_1.2-17 Rcpp_1.0.2 +## [47] ggbeeswarm_0.6.0 munsell_0.5.0 +## [49] viridis_0.5.1 stringi_1.4.3 +## [51] yaml_2.2.0 edgeR_3.26.8 +## [53] MASS_7.3-51.1 zlibbioc_1.30.0 +## [55] Rtsne_0.15 plyr_1.8.4 +## [57] blob_1.2.0 grid_3.6.0 +## [59] listenv_0.7.0 dqrng_0.2.1 +## [61] crayon_1.3.4 contfrac_1.1-12 +## [63] lattice_0.20-38 cowplot_1.0.0 +## [65] hms_0.5.1 locfit_1.5-9.1 +## [67] zeallot_0.1.0 knitr_1.25 +## 
[69] pillar_1.4.2 igraph_1.2.4.1 +## [71] future.apply_1.3.0 reshape2_1.4.3 +## [73] codetools_0.2-16 biomaRt_2.40.4 +## [75] XML_3.98-1.20 glue_1.3.1 +## [77] evaluate_0.14 deSolve_1.24 +## [79] vctrs_0.2.0 gtable_0.3.0 +## [81] purrr_0.3.2 future_1.14.0 +## [83] assertthat_0.2.1 xfun_0.9 +## [85] rsvd_1.0.2 viridisLite_0.3.0 +## [87] tibble_2.1.3 elliptic_1.4-0 +## [89] memoise_1.1.0 AnnotationDbi_1.46.1 +## [91] beeswarm_0.2.3 globals_0.12.4 +## [93] statmod_1.4.32</code></pre> -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +</div> +<div id="identifying-confounding-factors" class="section level2"> +<h2><span class="header-section-number">7.4</span> Identifying confounding factors</h2> +<div id="introduction-5" class="section level3"> +<h3><span class="header-section-number">7.4.1</span> Introduction</h3> +<p>There is a large number of potential confounders, artifacts and biases in +scRNA-seq data. One of the main challenges in analysing scRNA-seq data stems +from the fact that it is difficult to carry out a true technical replication +(why?) to distinguish biological and technical variability. In the previous +chapters we considered normalization and in this chapter we will continue to +explore how experimental artifacts can be identified and removed. 
We will +continue using the <code>scater</code> package since it provides a set of methods +specifically for quality control of experimental and explanatory variables. +Moreover, we will continue to work with the Blischak data that was used in the +previous chapter.</p> +<div class="sourceCode" id="cb454"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb454-1" data-line-number="1"><span class="kw">library</span>(scater, <span class="dt">quietly =</span> <span class="ot">TRUE</span>)</a> +<a class="sourceLine" id="cb454-2" data-line-number="2"><span class="kw">library</span>(scran)</a> +<a class="sourceLine" id="cb454-3" data-line-number="3"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb454-4" data-line-number="4">umi <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/tung/umi.rds"</span>)</a> +<a class="sourceLine" id="cb454-5" data-line-number="5">umi.qc <-<span class="st"> </span>umi[<span class="kw">rowData</span>(umi)<span class="op">$</span>use, <span class="kw">colData</span>(umi)<span class="op">$</span>use]</a> +<a class="sourceLine" id="cb454-6" data-line-number="6">endog_genes <-<span class="st"> </span><span class="op">!</span><span class="kw">rowData</span>(umi.qc)<span class="op">$</span>is_feature_control</a></code></pre></div> +<p>The <code>umi.qc</code> dataset contains filtered cells and genes. 
Our next step is to +explore technical drivers of variability in the data to inform data +normalisation before downstream analysis.</p> +</div> +<div id="correlations-with-pcs" class="section level3"> +<h3><span class="header-section-number">7.4.2</span> Correlations with PCs</h3> +<p>Let’s first look again at the PCA plot of the QCed dataset using the +scran-normalized log2-CPM values:</p> +<div class="sourceCode" id="cb455"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb455-1" data-line-number="1">qclust <-<span class="st"> </span><span class="kw">quickCluster</span>(umi.qc, <span class="dt">min.size =</span> <span class="dv">30</span>, <span class="dt">use.ranks =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb455-2" data-line-number="2">umi.qc <-<span class="st"> </span><span class="kw">computeSumFactors</span>(umi.qc, <span class="dt">sizes =</span> <span class="dv">15</span>, <span class="dt">clusters =</span> qclust)</a> +<a class="sourceLine" id="cb455-3" data-line-number="3">umi.qc <-<span class="st"> </span><span class="kw">normalize</span>(umi.qc)</a></code></pre></div> +<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its +## own size factors</code></pre> +<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own +## size factors</code></pre> +<div class="sourceCode" id="cb458"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb458-1" data-line-number="1"><span class="kw">reducedDim</span>(umi.qc, <span class="st">"PCA"</span>) <-<span class="st"> </span><span class="kw">reducedDim</span>(</a> +<a class="sourceLine" id="cb458-2" data-line-number="2"> <span class="kw">runPCA</span>(umi.qc[endog_genes,],</a> +<a class="sourceLine" id="cb458-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts"</span>, <span class="dt">ncomponents =</span> <span class="dv">10</span>), 
<span class="st">"PCA"</span>)</a> +<a class="sourceLine" id="cb458-4" data-line-number="4"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb458-5" data-line-number="5"> umi.qc,</a> +<a class="sourceLine" id="cb458-6" data-line-number="6"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb458-7" data-line-number="7"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span></a> +<a class="sourceLine" id="cb458-8" data-line-number="8">)</a></code></pre></div> +<div class="figure" style="text-align: center"><span id="fig:confound-pca"></span> +<img src="confounders_files/figure-html/confound-pca-1.png" alt="PCA plot of the tung data" width="90%" /> +<p class="caption"> +Figure 7.19: PCA plot of the tung data +</p> +</div> +<p><code>scater</code> allows one to identify principal components that correlate with +experimental and QC variables of interest (it ranks principle components by +<span class="math inline">\(R^2\)</span> from a linear model regressing PC value against the variable of interest).</p> +<p>Let’s test whether some of the variables correlate with any of the PCs.</p> +<div id="top-coldata-variables-associated-with-pcs" class="section level4"> +<h4><span class="header-section-number">7.4.2.1</span> Top colData variables associated with PCs</h4> +<p>The plot below shows, for each of the first 10 PCs, the variance explained by +the ten variables in <code>colData(umi.qc)</code> that are most strongly associated with +the PCs. 
[We will ignore the <code>sample_id</code> variable: it has a unique value for +each cell, so can explain all the variation for all PCs.]</p> +<div class="sourceCode" id="cb459"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb459-1" data-line-number="1"><span class="kw">plotExplanatoryPCs</span>(umi.qc)</a></code></pre></div> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'is_cell_control' with fewer than 2 unique levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_100_features_feature_control' with fewer than 2 +## unique levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_200_features_feature_control' with fewer than 2 +## unique levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_500_features_feature_control' with fewer than 2 +## unique levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_50_features_ERCC' with fewer than 2 unique +## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_100_features_ERCC' with fewer than 2 unique +## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_200_features_ERCC' with fewer than 2 unique +## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_500_features_ERCC' with fewer than 2 unique +## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_50_features_MT' 
with fewer than 2 unique levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_100_features_MT' with fewer than 2 unique +## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_200_features_MT' with fewer than 2 unique +## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_500_features_MT' with fewer than 2 unique +## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'use' with fewer than 2 unique levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'outlier' with fewer than 2 unique levels</code></pre> +<pre><code>## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = 
y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf</code></pre> +<div class="figure" style="text-align: center"><span id="fig:confound-find-pcs-total-features"></span> +<img src="confounders_files/figure-html/confound-find-pcs-total-features-1.png" alt="PC correlation with the number of detected genes" width="90%" /> +<p class="caption"> +Figure 7.20: PC correlation with the number of detected genes +</p> +</div> +<p>Indeed, we can see that <code>PC1</code> can be almost completely explained by <code>batch</code> and +<code>individual</code> (of course batch is nested within individual). The total counts +from ERCC spike-ins also explains a substantial proportion of the variability in +PC1.</p> +<p>Although number of detected genes is not strongly correlated with the PCs here +(after normalization), this is commonly the case and something to look out for. 
+[You might like to replicate the plot above using raw logcounts values to see +what happens without normalization]. This is a well-known issue in scRNA-seq and +was described <a href="http://biorxiv.org/content/early/2015/12/27/025528">here</a>.</p> +</div> +</div> +<div id="explanatory-variables" class="section level3"> +<h3><span class="header-section-number">7.4.3</span> Explanatory variables</h3> +<p><code>scater</code> can also compute the marginal <span class="math inline">\(R^2\)</span> for each variable when fitting a +linear model regressing expression values for each gene against just that +variable, and display a density plot of the gene-wise marginal <span class="math inline">\(R^2\)</span> values for +the variables.</p> +<div class="sourceCode" id="cb475"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb475-1" data-line-number="1"><span class="kw">plotExplanatoryVariables</span>(</a> +<a class="sourceLine" id="cb475-2" data-line-number="2"> umi.qc,</a> +<a class="sourceLine" id="cb475-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span>,</a> +<a class="sourceLine" id="cb475-4" data-line-number="4"> <span class="dt">variables =</span> <span class="kw">c</span>(</a> +<a class="sourceLine" id="cb475-5" data-line-number="5"> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb475-6" data-line-number="6"> <span class="st">"total_counts"</span>,</a> +<a class="sourceLine" id="cb475-7" data-line-number="7"> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb475-8" data-line-number="8"> <span class="st">"individual"</span>,</a> +<a class="sourceLine" id="cb475-9" data-line-number="9"> <span class="st">"pct_counts_ERCC"</span>,</a> +<a class="sourceLine" id="cb475-10" data-line-number="10"> <span class="st">"pct_counts_MT"</span></a> +<a class="sourceLine" id="cb475-11" data-line-number="11"> )</a> +<a class="sourceLine" 
id="cb475-12" data-line-number="12">)</a></code></pre></div> +<div class="figure" style="text-align: center"><span id="fig:confound-find-expl-vars"></span> +<img src="confounders_files/figure-html/confound-find-expl-vars-1.png" alt="Explanatory variables" width="90%" /> +<p class="caption"> +Figure 7.21: Explanatory variables +</p> +</div> +<p>This analysis indicates that the number of detected genes (again) and also the +sequencing depth (total number of UMI counts per cell) have substantial +explanatory power for many genes, so these variables are good candidates for +conditioning out in a normalization step, or including in downstream statistical +models [cf. <code>sctransform</code>’s approach to normalization]. Expression of ERCCs also +appears to be an important explanatory variable and one notable feature of the +above plot is that batch explains more than individual. What does that tell us +about the technical and biological variability of the data?</p> +</div> +<div id="other-confounders" class="section level3"> +<h3><span class="header-section-number">7.4.4</span> Other confounders</h3> +<p>In addition to correcting for batch, there are other factors that one may want +to compensate for. As with batch correction, these adjustments require extrinsic +information. One popular method is <a href="https://github.com/PMBio/scLVM">scLVM</a> which +allows you to identify and subtract the effect from processes such as cell-cycle +or apoptosis.</p> +<p>In addition, protocols may differ in terms of their coverage of each transcript, +their bias based on the average content of <strong>A/T</strong> nucleotides, or their ability +to capture short transcripts. Ideally, we would like to compensate for all of +these differences and biases.</p> +</div> +<div id="exercise-1" class="section level3"> +<h3><span class="header-section-number">7.4.5</span> Exercise</h3> +<p>Perform the same analysis with read counts of the Blischak data. 
Use +<code>tung/reads.rds</code> file to load the reads SCESet object. Once you have finished +please compare your results to ours (next chapter).</p> +</div> +<div id="sessioninfo-3" class="section level3"> +<h3><span class="header-section-number">7.4.6</span> sessionInfo()</h3> +<pre><code>## R version 3.6.0 (2019-04-26) +## Platform: x86_64-pc-linux-gnu (64-bit) +## Running under: Ubuntu 18.04.3 LTS +## +## Matrix products: default +## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 +## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 +## +## locale: +## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C +## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 +## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 +## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C +## [9] LC_ADDRESS=C LC_TELEPHONE=C +## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C +## +## attached base packages: +## [1] parallel stats4 stats graphics grDevices utils datasets +## [8] methods base +## +## other attached packages: +## [1] scran_1.12.1 scater_1.12.2 +## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 +## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 +## [7] BiocParallel_1.18.1 matrixStats_0.55.0 +## [9] Biobase_2.44.0 GenomicRanges_1.36.1 +## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 +## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 +## [15] knitr_1.25 +## +## loaded via a namespace (and not attached): +## [1] locfit_1.5-9.1 Rcpp_1.0.2 +## [3] rsvd_1.0.2 lattice_0.20-38 +## [5] assertthat_0.2.1 digest_0.6.21 +## [7] R6_2.4.0 dynamicTreeCut_1.63-1 +## [9] evaluate_0.14 highr_0.8 +## [11] pillar_1.4.2 zlibbioc_1.30.0 +## [13] rlang_0.4.0 lazyeval_0.2.2 +## [15] irlba_2.3.3 Matrix_1.2-17 +## [17] rmarkdown_1.15 labeling_0.3 +## [19] BiocNeighbors_1.2.0 statmod_1.4.32 +## [21] stringr_1.4.0 igraph_1.2.4.1 +## [23] RCurl_1.95-4.12 munsell_0.5.0 +## [25] compiler_3.6.0 vipor_0.4.5 +## [27] BiocSingular_1.0.0 xfun_0.9 +## [29] pkgconfig_2.0.3 ggbeeswarm_0.6.0 +## [31] htmltools_0.3.6 
tidyselect_0.2.5 +## [33] tibble_2.1.3 gridExtra_2.3 +## [35] GenomeInfoDbData_1.2.1 bookdown_0.13 +## [37] edgeR_3.26.8 viridisLite_0.3.0 +## [39] crayon_1.3.4 dplyr_0.8.3 +## [41] withr_2.1.2 bitops_1.0-6 +## [43] grid_3.6.0 gtable_0.3.0 +## [45] magrittr_1.5 scales_1.0.0 +## [47] dqrng_0.2.1 stringi_1.4.3 +## [49] XVector_0.24.0 viridis_0.5.1 +## [51] limma_3.40.6 DelayedMatrixStats_1.6.1 +## [53] cowplot_1.0.0 tools_3.6.0 +## [55] glue_1.3.1 beeswarm_0.2.3 +## [57] purrr_0.3.2 yaml_2.2.0 +## [59] colorspace_1.4-1</code></pre> -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached - -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +</div> 
+</div> +<div id="identifying-confounding-factors-reads" class="section level2"> +<h2><span class="header-section-number">7.5</span> Identifying confounding factors (Reads)</h2> +<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its +## own size factors</code></pre> +<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own +## size factors</code></pre> +<div class="figure" style="text-align: center"><span id="fig:confound-pca-reads"></span> +<img src="confounders-reads_files/figure-html/confound-pca-reads-1.png" alt="PCA plot of the tung data" width="90%" /> +<p class="caption"> +Figure 7.22: PCA plot of the tung data +</p> +</div> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'is_cell_control' with fewer than 2 unique levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_100_features_feature_control' with fewer than 2 +## unique levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_200_features_feature_control' with fewer than 2 +## unique levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_500_features_feature_control' with fewer than 2 +## unique levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_50_features_ERCC' with fewer than 2 unique +## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_100_features_ERCC' with fewer than 2 unique +## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_200_features_ERCC' with fewer than 2 unique 
+## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_500_features_ERCC' with fewer than 2 unique +## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_50_features_MT' with fewer than 2 unique levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_100_features_MT' with fewer than 2 unique +## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_200_features_MT' with fewer than 2 unique +## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'pct_counts_in_top_500_features_MT' with fewer than 2 unique +## levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'use' with fewer than 2 unique levels</code></pre> +<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): +## ignoring 'outlier' with fewer than 2 unique levels</code></pre> +<pre><code>## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning +## -Inf</code></pre> +<div class="figure" style="text-align: center"><span id="fig:confound-find-pcs-reads"></span> +<img src="confounders-reads_files/figure-html/confound-find-pcs-reads-1.png" alt="PC correlation with the number of detected genes" width="90%" /> +<p class="caption"> +Figure 7.23: PC correlation with the number of detected genes +</p> +</div> +<div class="figure" style="text-align: center"><span id="fig:confound-find-expl-vars-reads"></span> +<img src="confounders-reads_files/figure-html/confound-find-expl-vars-reads-1.png" 
alt="Explanatory variables" width="90%" /> +<p class="caption"> +Figure 7.24: Explanatory variables +</p> +</div> +<pre><code>## R version 3.6.0 (2019-04-26) +## Platform: x86_64-pc-linux-gnu (64-bit) +## Running under: Ubuntu 18.04.3 LTS +## +## Matrix products: default +## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 +## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 +## +## locale: +## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C +## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 +## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 +## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C +## [9] LC_ADDRESS=C LC_TELEPHONE=C +## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C +## +## attached base packages: +## [1] parallel stats4 stats graphics grDevices utils datasets +## [8] methods base +## +## other attached packages: +## [1] scran_1.12.1 scater_1.12.2 +## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 +## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 +## [7] BiocParallel_1.18.1 matrixStats_0.55.0 +## [9] Biobase_2.44.0 GenomicRanges_1.36.1 +## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 +## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 +## +## loaded via a namespace (and not attached): +## [1] locfit_1.5-9.1 Rcpp_1.0.2 +## [3] rsvd_1.0.2 lattice_0.20-38 +## [5] assertthat_0.2.1 digest_0.6.21 +## [7] R6_2.4.0 dynamicTreeCut_1.63-1 +## [9] evaluate_0.14 highr_0.8 +## [11] pillar_1.4.2 zlibbioc_1.30.0 +## [13] rlang_0.4.0 lazyeval_0.2.2 +## [15] irlba_2.3.3 Matrix_1.2-17 +## [17] rmarkdown_1.15 labeling_0.3 +## [19] BiocNeighbors_1.2.0 statmod_1.4.32 +## [21] stringr_1.4.0 igraph_1.2.4.1 +## [23] RCurl_1.95-4.12 munsell_0.5.0 +## [25] compiler_3.6.0 vipor_0.4.5 +## [27] BiocSingular_1.0.0 xfun_0.9 +## [29] pkgconfig_2.0.3 ggbeeswarm_0.6.0 +## [31] htmltools_0.3.6 tidyselect_0.2.5 +## [33] tibble_2.1.3 gridExtra_2.3 +## [35] GenomeInfoDbData_1.2.1 bookdown_0.13 +## [37] edgeR_3.26.8 viridisLite_0.3.0 +## [39] crayon_1.3.4 dplyr_0.8.3 +## [41] withr_2.1.2 
bitops_1.0-6 +## [43] grid_3.6.0 gtable_0.3.0 +## [45] magrittr_1.5 scales_1.0.0 +## [47] dqrng_0.2.1 stringi_1.4.3 +## [49] XVector_0.24.0 viridis_0.5.1 +## [51] limma_3.40.6 DelayedMatrixStats_1.6.1 +## [53] cowplot_1.0.0 tools_3.6.0 +## [55] glue_1.3.1 beeswarm_0.2.3 +## [57] purrr_0.3.2 yaml_2.2.0 +## [59] colorspace_1.4-1 knitr_1.25</code></pre> -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +</div> +<div id="batch-effects" class="section level2"> +<h2><span class="header-section-number">7.6</span> Batch effects</h2> +<div id="introduction-6" class="section level3"> +<h3><span class="header-section-number">7.6.1</span> Introduction</h3> +<p>In the previous chapter we normalized for library size, effectively removing it +as a confounder. Now we will consider removing other less well defined +confounders from our data. Technical confounders (aka batch effects) can arise +from difference in reagents, isolation methods, the lab/experimenter who +performed the experiment, even which day/time the experiment was performed. +Accounting for technical confounders, and batch effects particularly, is a large +topic that also involves principles of experimental design. Here we address +approaches that can be taken to account for confounders when the experimental +design is appropriate.</p> +<p>Fundamentally, accounting for technical confounders involves identifying and, +ideally, removing sources of variation in the expression data that are not +related to (i.e. are confounding) the biological signal of interest. 
Various +approaches exist, some of which use spike-in or housekeeping genes, and some of +which use endogenous genes.</p> +<div id="advantages-and-disadvantages-of-using-spike-ins-to-remove-confounders" class="section level4"> +<h4><span class="header-section-number">7.6.1.1</span> Advantages and disadvantages of using spike-ins to remove confounders</h4> +<p>The use of spike-ins as control genes is conceptually appealing, since (ideally) +the same amount of ERCC (or other) spike-in would be added to each cell in our +experiment. In principle, all the variability we observe for these “genes” is +due to technical noise; whereas endogenous genes are affected by both technical +noise and biological variability. Technical noise can be removed by fitting a +model to the spike-ins and “subtracting” this from the endogenous genes. There +are several methods available based on this premise (eg. +<a href="https://github.com/catavallejos/BASiCS">BASiCS</a>, +<a href="https://github.com/PMBio/scLVM">scLVM</a>, +<a href="http://bioconductor.org/packages/release/bioc/html/RUVSeq.html">RUVg</a>); each +using different noise models and different fitting procedures. Alternatively, +one can identify genes which exhibit significant variation beyond technical +noise (eg. Distance to median, <a href="http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2645.html">Highly variable +genes</a>).</p> +<p>Unfortunately, there are major issues with the use of spike-ins for +normalisation that limit their utility in practice. Perhaps surprisingly, their +variability can, for various reasons, actually be <em>higher</em> than that of +endogenous genes. 
One key reason for the difficulty of their use in practice is +the need to pipette minuscule volumes of spike-in solution accurately and consistently into each reaction. The most popular +set of spike-ins, namely ERCCs, are derived from bacterial sequences, which +raises concerns that their base content and structure diverges too far from gene +structure in other biological systems of interest (e.g. mammalian genes) to be +reliable for normalisation. Even in the best-case scenarios, spike-ins are +limited to use on plate-based platforms; they are fundamentally incompatible +with droplet-based platforms.</p> +<p>Given the issues with using spike-ins, better results can often be obtained by +using endogenous genes instead. Given their limited availability, normalisation +methods based only on endogenous genes needed to be developed and we consider +them generally preferable, even for platforms where spike-ins may be used. Where +we have a large number of endogenous genes that, on average, do not vary +systematically between cells and where we expect technical effects to affect a +large number of genes (a very common and reasonable assumption), then such +methods (for example, the RUVs method) can perform well.</p> +<p>We explore both general approaches below.</p> +<div class="sourceCode" id="cb495"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb495-1" data-line-number="1"><span class="kw">library</span>(scRNA.seq.funcs)</a> +<a class="sourceLine" id="cb495-2" data-line-number="2"><span class="kw">library</span>(RUVSeq)</a> +<a class="sourceLine" id="cb495-3" data-line-number="3"><span class="kw">library</span>(scater)</a> +<a class="sourceLine" id="cb495-4" data-line-number="4"><span class="kw">library</span>(SingleCellExperiment)</a> +<a class="sourceLine" id="cb495-5" data-line-number="5"><span class="kw">library</span>(scran)</a> +<a class="sourceLine" id="cb495-6" data-line-number="6"><span class="kw">library</span>(kBET)</a> +<a class="sourceLine" id="cb495-7" 
data-line-number="7"><span class="kw">library</span>(sva) <span class="co"># Combat</span></a> +<a class="sourceLine" id="cb495-8" data-line-number="8"><span class="kw">library</span>(edgeR)</a> +<a class="sourceLine" id="cb495-9" data-line-number="9"><span class="kw">library</span>(harmony)</a> +<a class="sourceLine" id="cb495-10" data-line-number="10"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a> +<a class="sourceLine" id="cb495-11" data-line-number="11"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb495-12" data-line-number="12">umi <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/tung/umi.rds"</span>)</a> +<a class="sourceLine" id="cb495-13" data-line-number="13">umi.qc <-<span class="st"> </span>umi[<span class="kw">rowData</span>(umi)<span class="op">$</span>use, <span class="kw">colData</span>(umi)<span class="op">$</span>use]</a> +<a class="sourceLine" id="cb495-14" data-line-number="14">endog_genes <-<span class="st"> </span><span class="op">!</span><span class="kw">rowData</span>(umi.qc)<span class="op">$</span>is_feature_control</a> +<a class="sourceLine" id="cb495-15" data-line-number="15">erccs <-<span class="st"> </span><span class="kw">rowData</span>(umi.qc)<span class="op">$</span>is_feature_control</a> +<a class="sourceLine" id="cb495-16" data-line-number="16"><span class="co">## Apply scran sum factor normalization</span></a> +<a class="sourceLine" id="cb495-17" data-line-number="17">qclust <-<span class="st"> </span><span class="kw">quickCluster</span>(umi.qc, <span class="dt">min.size =</span> <span class="dv">30</span>, <span class="dt">use.ranks =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb495-18" data-line-number="18">umi.qc <-<span class="st"> </span><span class="kw">computeSumFactors</span>(umi.qc, <span class="dt">sizes =</span> <span class="dv">15</span>, <span 
class="dt">clusters =</span> qclust)</a> +<a class="sourceLine" id="cb495-19" data-line-number="19">umi.qc <-<span class="st"> </span><span class="kw">normalize</span>(umi.qc)</a></code></pre></div> +</div> +</div> +<div id="linear-models" class="section level3"> +<h3><span class="header-section-number">7.6.2</span> Linear models</h3> +<p>Linear models offer a relatively simple approach to accounting for batch effects +and confounders. A linear model can correct for batches while preserving +biological effects if you have a balanced design. In a confounded/replicate +design biological effects will not be fit/preserved. We could remove batch +effects from each individual separately in order to preserve biological (and +technical) variance between individuals (we will apply a similar with +<code>mnnCorrect</code>, below).</p> +<p>Depending on how we have pre-processed our scRNA-seq data or what modelling +assumptions we are willing to make, we may choose to use normal (Gaussian) +linear models (i.e. assuming a normal distribution for noise) or generalized +linear models (GLM), where we can use any distribution from the exponential +family. Given that we obtain highly-variable count data from scRNA-seq assays, +the obvious choice for a GLM is to use the negative binomial distribution, which +has proven highly successful in the analysis of bulk RNA-seq data.</p> +<p>For demonstration purposes here we will naively correct all confounded batch +effects.</p> +<div id="gaussian-normal-linear-models" class="section level4"> +<h4><span class="header-section-number">7.6.2.1</span> Gaussian (normal) linear models</h4> +<p>The <a href="https://bioconductor.org/packages/release/bioc/html/limma.html"><code>limma</code></a> +package in Bioconductor offers a convenient and efficient means to fit a linear +model (with the same design matrix) to a dataset with a large number of features +(i.e. genes) <span class="citation">(Ritchie et al. 
<a href="#ref-Ritchie2015-ra">2015</a>)</span>. An added advantage of <code>limma</code> is its ability to +apply empirical Bayes squeezing of variance estimate to improve inference.</p> +<p>Provided we are satisfied making the assumption of a Gaussian distribution for +residuals (this may be reasonable for normalized log-counts in many cases; but +it may not be—debate continues in the literature), then we can apply <code>limma</code> +to regress out (known) unwanted sources of variation as follows.</p> +<div class="sourceCode" id="cb496"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb496-1" data-line-number="1"><span class="co">## fit a model just accounting for batch</span></a> +<a class="sourceLine" id="cb496-2" data-line-number="2">lm_design_batch <-<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="dv">0</span> <span class="op">+</span><span class="st"> </span>batch, <span class="dt">data =</span> <span class="kw">colData</span>(umi.qc))</a> +<a class="sourceLine" id="cb496-3" data-line-number="3">fit_lm_batch <-<span class="st"> </span><span class="kw">lmFit</span>(<span class="kw">logcounts</span>(umi.qc), lm_design_batch)</a> +<a class="sourceLine" id="cb496-4" data-line-number="4">resids_lm_batch <-<span class="st"> </span><span class="kw">residuals</span>(fit_lm_batch, <span class="kw">logcounts</span>(umi.qc))</a> +<a class="sourceLine" id="cb496-5" data-line-number="5"><span class="kw">assay</span>(umi.qc, <span class="st">"lm_batch"</span>) <-<span class="st"> </span>resids_lm_batch</a> +<a class="sourceLine" id="cb496-6" data-line-number="6"></a> +<a class="sourceLine" id="cb496-7" data-line-number="7"><span class="kw">reducedDim</span>(umi.qc, <span class="st">"PCA_lm_batch"</span>) <-<span class="st"> </span><span class="kw">reducedDim</span>(</a> +<a class="sourceLine" id="cb496-8" data-line-number="8"> <span class="kw">runPCA</span>(umi.qc[endog_genes, ], <span 
class="dt">exprs_values =</span> <span class="st">"lm_batch"</span>), <span class="st">"PCA"</span>)</a> +<a class="sourceLine" id="cb496-9" data-line-number="9"></a> +<a class="sourceLine" id="cb496-10" data-line-number="10"><span class="kw">plotReducedDim</span>(umi.qc, <span class="dt">use_dimred =</span> <span class="st">"PCA_lm_batch"</span>,</a> +<a class="sourceLine" id="cb496-11" data-line-number="11"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>, </a> +<a class="sourceLine" id="cb496-12" data-line-number="12"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb496-13" data-line-number="13"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb496-14" data-line-number="14"> ) <span class="op">+</span></a> +<a class="sourceLine" id="cb496-15" data-line-number="15"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"LM - regress out batch"</span>)</a></code></pre></div> +<p>Two problems are immediately apparent with the approach above. First, batch is +nested within individual, so simply regressing out batch as we have done above +also regresses out differences between individuals that we would like to +preserve. Second, we observe that the first principal component seems to +separate cells by number of genes (features) expressed, which is undesirable.</p> +<p>We can address these concerns by correcting for batch within each individual +separately, and also fitting the proportion of genes expressed per cell as a +covariate. [NB: to preserve overall differences in expression levels between +individuals we will need to apply a slight hack to the LM fit results (setting +the intercept coefficient to zero).]</p> +<p><strong>Exercise 2</strong></p> +<p>Perform LM correction for each individual separately. 
Store the final corrected +matrix in the <code>lm_batch_indi</code> slot.</p> +<div class="sourceCode" id="cb497"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb497-1" data-line-number="1"><span class="co">## define cellular detection rate (cdr), i.e. proportion of genes expressed in each cell</span></a> +<a class="sourceLine" id="cb497-2" data-line-number="2">umi.qc<span class="op">$</span>cdr <-<span class="st"> </span>umi.qc<span class="op">$</span>total_features_by_counts_endogenous <span class="op">/</span><span class="st"> </span><span class="kw">nrow</span>(umi.qc)</a> +<a class="sourceLine" id="cb497-3" data-line-number="3"><span class="co">## fit a model just accounting for batch by individual</span></a> +<a class="sourceLine" id="cb497-4" data-line-number="4">lm_design_batch1 <-<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span>batch <span class="op">+</span><span class="st"> </span>cdr,</a> +<a class="sourceLine" id="cb497-5" data-line-number="5">                               <span class="dt">data =</span> <span class="kw">colData</span>(umi.qc)[umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>,])</a> +<a class="sourceLine" id="cb497-6" data-line-number="6">fit_indi1 <-<span class="st"> </span><span class="kw">lmFit</span>(<span class="kw">logcounts</span>(umi.qc)[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>], lm_design_batch1)</a> +<a class="sourceLine" id="cb497-7" data-line-number="7">fit_indi1<span class="op">$</span>coefficients[,<span class="dv">1</span>] <-<span class="st"> </span><span class="dv">0</span> <span class="co">## replace intercept with 0 to preserve reference batch</span></a> +<a class="sourceLine" id="cb497-8" data-line-number="8">resids_lm_batch1 <-<span class="st"> </span><span class="kw">residuals</span>(fit_indi1, <span class="kw">logcounts</span>(umi.qc)[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>])</a> +<a class="sourceLine" id="cb497-9" data-line-number="9"></a> +<a class="sourceLine" id="cb497-10" data-line-number="10">lm_design_batch2 <-<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span>batch <span class="op">+</span><span class="st"> </span>cdr,</a> +<a class="sourceLine" id="cb497-11" data-line-number="11">                               <span class="dt">data =</span> <span class="kw">colData</span>(umi.qc)[umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>,])</a> +<a class="sourceLine" id="cb497-12" data-line-number="12">fit_indi2 <-<span class="st"> </span><span class="kw">lmFit</span>(<span class="kw">logcounts</span>(umi.qc)[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>], lm_design_batch2)</a> +<a class="sourceLine" id="cb497-13" data-line-number="13">fit_indi2<span class="op">$</span>coefficients[,<span class="dv">1</span>] <-<span class="st"> </span><span class="dv">0</span> <span class="co">## replace intercept with 0 to preserve reference batch</span></a> +<a class="sourceLine" id="cb497-14" data-line-number="14">resids_lm_batch2 <-<span class="st"> </span><span class="kw">residuals</span>(fit_indi2, <span class="kw">logcounts</span>(umi.qc)[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>])</a> +<a class="sourceLine" id="cb497-15" data-line-number="15"></a> +<a class="sourceLine" id="cb497-16" data-line-number="16">lm_design_batch3 <-<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span>batch <span class="op">+</span><span class="st"> </span>cdr,</a> +<a class="sourceLine" id="cb497-17" data-line-number="17">                               <span class="dt">data =</span> <span class="kw">colData</span>(umi.qc)[umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>,])</a> +<a class="sourceLine" id="cb497-18" data-line-number="18">fit_indi3 <-<span class="st"> </span><span class="kw">lmFit</span>(<span class="kw">logcounts</span>(umi.qc)[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>], lm_design_batch3)</a> +<a class="sourceLine" id="cb497-19" data-line-number="19">fit_indi3<span class="op">$</span>coefficients[,<span class="dv">1</span>] <-<span class="st"> </span><span class="dv">0</span> <span class="co">## replace intercept with 0 to preserve reference batch</span></a> +<a class="sourceLine" id="cb497-20" data-line-number="20">resids_lm_batch3 <-<span class="st"> </span><span class="kw">residuals</span>(fit_indi3, <span class="kw">logcounts</span>(umi.qc)[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>])</a> +<a class="sourceLine" id="cb497-21" data-line-number="21"></a> +<a class="sourceLine" id="cb497-22" data-line-number="22"><span class="kw">identical</span>(<span class="kw">colnames</span>(umi.qc), <span class="kw">colnames</span>(<span class="kw">cbind</span>(resids_lm_batch1, resids_lm_batch2, resids_lm_batch3)))</a> +<a class="sourceLine" id="cb497-23" data-line-number="23"><span class="kw">assay</span>(umi.qc, <span class="st">"lm_batch_indi"</span>) <-<span class="st"> </span><span class="kw">cbind</span>(resids_lm_batch1, resids_lm_batch2, resids_lm_batch3)</a> +<a class="sourceLine" id="cb497-24" data-line-number="24"></a> +<a class="sourceLine" id="cb497-25" data-line-number="25"><span class="kw">reducedDim</span>(umi.qc, <span class="st">"PCA_lm_batch_indi"</span>) <-<span class="st"> </span><span class="kw">reducedDim</span>(</a> +<a class="sourceLine" id="cb497-26" data-line-number="26">    <span class="kw">runPCA</span>(umi.qc[endog_genes, ], <span class="dt">exprs_values =</span> <span class="st">"lm_batch_indi"</span>), <span class="st">"PCA"</span>)</a> +<a class="sourceLine" id="cb497-27" data-line-number="27"></a> +<a class="sourceLine" id="cb497-28" data-line-number="28"><span class="kw">plotReducedDim</span>(umi.qc, <span class="dt">use_dimred =</span> <span class="st">"PCA_lm_batch_indi"</span>,</a> +<a class="sourceLine" id="cb497-29" data-line-number="29">               <span class="dt">colour_by =</span> <span class="st">"batch"</span>, </a> +<a class="sourceLine" id="cb497-30" data-line-number="30">               <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb497-31" data-line-number="31">               <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb497-32" data-line-number="32">               ) <span class="op">+</span></a> +<a class="sourceLine" id="cb497-33" data-line-number="33"><span class="st">    </span><span class="kw">ggtitle</span>(<span class="st">"LM - regress out batch within individuals separately"</span>)</a></code></pre></div> +<p>What do you think of the results of this approach?</p> +</div> +<div id="negative-binomial-generalized-linear-models" class="section level4"> +<h4><span class="header-section-number">7.6.2.2</span> Negative binomial generalized linear models</h4> +</div> +</div> +<div id="sctransform-2" class="section level3"> +<h3><span class="header-section-number">7.6.3</span> sctransform</h3> +<p>The <code>sctransform</code> approach to using Pearson residuals from a regularized +negative binomial generalized linear model was introduced above. Here we +demonstrate how to apply this method.</p> +<p>Note that (due to what looks like a bug in this version of <code>sctransform</code>) we +need to convert the UMI count matrix to a sparse format to apply sctransform.</p> +<p>These <code>sctransform</code> results will face the problem mentioned above of batch being +nested within individual, which means that we cannot directly remove batch +effects without removing differences between individuals. 
However, here we will +demonstrate how you <em>would</em> try to remove batch effects with <code>sctransform</code> for a +kinder experimental design.</p> +<div class="sourceCode" id="cb498"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb498-1" data-line-number="1">umi_sparse <-<span class="st"> </span><span class="kw">as</span>(<span class="kw">counts</span>(umi.qc), <span class="st">"dgCMatrix"</span>)</a> +<a class="sourceLine" id="cb498-2" data-line-number="2"><span class="co">### Genes expressed in at least 5 cells will be kept</span></a> +<a class="sourceLine" id="cb498-3" data-line-number="3">sctnorm_data <-<span class="st"> </span>sctransform<span class="op">::</span><span class="kw">vst</span>(<span class="dt">umi =</span> umi_sparse, <span class="dt">min_cells =</span> <span class="dv">1</span>,</a> +<a class="sourceLine" id="cb498-4" data-line-number="4"> <span class="dt">cell_attr =</span> <span class="kw">as.data.frame</span>(<span class="kw">colData</span>(umi.qc)),</a> +<a class="sourceLine" id="cb498-5" data-line-number="5"> <span class="dt">latent_var =</span> <span class="kw">c</span>(<span class="st">"log10_total_counts_endogenous"</span>, <span class="st">"batch"</span>))</a></code></pre></div> +<pre><code>## Calculating cell attributes for input UMI matrix</code></pre> +<pre><code>## Variance stabilizing transformation of count matrix of size 14066 by 657</code></pre> +<pre><code>## Model formula is y ~ log10_total_counts_endogenous + batch</code></pre> +<pre><code>## Get Negative Binomial regression parameters per gene</code></pre> +<pre><code>## Using 2000 genes, 657 cells</code></pre> +<pre><code>## + | + | | 0%</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -1641,11 +2537,11 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = 
y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## - | - |================ | 25%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -1707,6 +2603,10 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -1805,6 +2705,10 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -1987,6 +2891,12 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## + | + |======== | 12%</code></pre> 
+<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -2021,11 +2931,51 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## - | - |======================== | 38%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -2392,7 +3342,7 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> <pre><code>## | - |================================ | 50%</code></pre> + |================ | 25%</code></pre> <pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -2749,11 +3699,7 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## - | - |========================================= | 62%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -2861,6 +3807,12 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## + | + |======================== | 38%</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached @@ -3139,11 +4091,7 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## - | - |================================================= | 75%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -3329,6 +4277,12 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## + | + |================================ | 50%</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -3547,11 +4501,7 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## - | - |========================================================= | 88%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -3781,6 +4731,12 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached</code></pre> +<pre><code>## + | + |========================================= | 62%</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -3863,527 +4819,905 @@ need to convert the UMI count matrix to a sparse format to apply sctransform.</p ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## - | - |=================================================================| 100%</code></pre> -<pre><code>## Found 5 outliers - those will be ignored in fitting/regularization step</code></pre> -<pre><code>## Second step: Get residuals using fitted parameters for 14066 genes</code></pre> -<pre><code>## - | - | | 0% - | - |= | 2% - | - |== | 4% - | - |==== | 5% - | - |===== | 7% - | - |====== | 9% - | - |======= | 11% - | - |======== | 13% - | - |========= | 15% - | - |=========== | 16% - | - |============ | 18% - | - |============= | 20% - | - |============== | 22% - | - |=============== | 24% - | - |================= | 25% - | - |================== | 27% - | - |=================== | 29% - | - |==================== | 31% - | - |===================== | 33% - | - |====================== | 35% - | - |======================== | 36% - | - |========================= | 38% - | - |========================== | 40% - | - |=========================== | 42% - | - |============================ | 44% - | - |============================== | 45% - | - |=============================== | 47% - | - |================================ | 49% - | - |================================= | 51% - | - |================================== | 53% - | - |=================================== | 55% - | - |===================================== | 56% - | - 
|====================================== | 58% - | - |======================================= | 60% - | - |======================================== | 62% - | - |========================================= | 64% - | - |=========================================== | 65% - | - |============================================ | 67% - | - |============================================= | 69% - | - |============================================== | 71% - | - |=============================================== | 73% - | - |================================================ | 75% - | - |================================================== | 76% - | - |=================================================== | 78% - | - |==================================================== | 80% - | - |===================================================== | 82% - | - |====================================================== | 84% - | - |======================================================== | 85% - | - |========================================================= | 87% - | - |========================================================== | 89% - | - |=========================================================== | 91% - | - |============================================================ | 93% - | - |============================================================= | 95% - | - |=============================================================== | 96% - | - |================================================================ | 98% - | - |=================================================================| 100%</code></pre> -<pre><code>## Calculating gene attributes</code></pre> -<pre><code>## Wall clock passed: Time difference of 21.43582 secs</code></pre> -<div class="sourceCode" id="cb343"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb343-1" data-line-number="1"><span class="co">## Pearson residuals, or deviance residuals</span></a> -<a class="sourceLine" id="cb343-2" data-line-number="2"><span 
class="kw">dim</span>(sctnorm_data<span class="op">$</span>y)</a></code></pre></div> -<pre><code>## [1] 14066 657</code></pre> -<div class="sourceCode" id="cb345"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb345-1" data-line-number="1"><span class="kw">dim</span>(umi.qc)</a></code></pre></div> -<pre><code>## [1] 14066 657</code></pre> -<div class="sourceCode" id="cb347"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb347-1" data-line-number="1">sctnorm_data<span class="op">$</span>model_str</a></code></pre></div> -<pre><code>## [1] "y ~ log10_total_counts_endogenous"</code></pre> -<div class="sourceCode" id="cb349"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb349-1" data-line-number="1"><span class="kw">assay</span>(umi.qc, <span class="st">"sctrans_norm"</span>) <-<span class="st"> </span>sctnorm_data<span class="op">$</span>y</a></code></pre></div> -<p>Let us look at the NB GLM model parameters estimated by sctransform.</p> -<div class="sourceCode" id="cb350"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb350-1" data-line-number="1"><span class="co">#sce$log10_total_counts</span></a> -<a class="sourceLine" id="cb350-2" data-line-number="2"><span class="co">## Matrix of estimated model parameters per gene (theta and regression coefficients)</span></a> -<a class="sourceLine" id="cb350-3" data-line-number="3">sctransform<span class="op">::</span><span class="kw">plot_model_pars</span>(sctnorm_data)</a></code></pre></div> -<p><img src="exprs-norm_files/figure-html/sctransform-params-plot-1.png" width="90%" style="display: block; margin: auto;" /></p> -<p>We can look at the effect of sctransform’s normalization on three particular -genes, ACTB, POU5F1 (aka OCT4) and CD74.</p> -<div class="sourceCode" id="cb351"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb351-1" data-line-number="1"><span 
class="co">##c('ACTB', 'Rpl10', 'Cd74')</span></a> -<a class="sourceLine" id="cb351-2" data-line-number="2">genes_plot <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"ENSG00000075624"</span>, <span class="st">"ENSG00000204531"</span>, <span class="st">"ENSG00000019582"</span>)</a> -<a class="sourceLine" id="cb351-3" data-line-number="3">sctransform<span class="op">::</span><span class="kw">plot_model</span>(sctnorm_data, umi_sparse, genes_plot, </a> -<a class="sourceLine" id="cb351-4" data-line-number="4"> <span class="dt">plot_residual =</span> <span class="ot">TRUE</span>, <span class="dt">cell_attr =</span> <span class="kw">as.data.frame</span>(<span class="kw">colData</span>(umi.qc)))</a></code></pre></div> -<p><img src="exprs-norm_files/figure-html/sctransform-genes-plot-1.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb352"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb352-1" data-line-number="1"><span class="kw">reducedDim</span>(umi.qc, <span class="st">"PCA_sctrans_norm"</span>) <-<span class="st"> </span><span class="kw">reducedDim</span>(</a> -<a class="sourceLine" id="cb352-2" data-line-number="2"> <span class="kw">runPCA</span>(umi.qc[endog_genes, ], <span class="dt">exprs_values =</span> <span class="st">"sctrans_norm"</span>)</a> -<a class="sourceLine" id="cb352-3" data-line-number="3">)</a> -<a class="sourceLine" id="cb352-4" data-line-number="4"><span class="kw">plotReducedDim</span>(</a> -<a class="sourceLine" id="cb352-5" data-line-number="5"> umi.qc,</a> -<a class="sourceLine" id="cb352-6" data-line-number="6"> <span class="dt">use_dimred =</span> <span class="st">"PCA_sctrans_norm"</span>,</a> -<a class="sourceLine" id="cb352-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb352-8" data-line-number="8"> <span class="dt">size_by =</span> <span 
class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb352-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb352-10" data-line-number="10">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"PCA plot: sctransform normalization"</span>) </a></code></pre></div> -<div class="figure" style="text-align: center"><span id="fig:norm-pca-sctransform"></span> -<img src="exprs-norm_files/figure-html/norm-pca-sctransform-1.png" alt="PCA plot of the tung data after sctransform normalisation (Pearson residuals)." width="90%" /> -<p class="caption"> -Figure 7.7: PCA plot of the tung data after sctransform normalisation (Pearson residuals). -</p> -</div> -<div class="sourceCode" id="cb353"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb353-1" data-line-number="1"><span class="kw">plotRLE</span>(</a> -<a class="sourceLine" id="cb353-2" data-line-number="2"> umi.qc[endog_genes, ], </a> -<a class="sourceLine" id="cb353-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"sctrans_norm"</span>,</a> -<a class="sourceLine" id="cb353-4" data-line-number="4"> <span class="dt">colour_by =</span> <span class="st">"batch"</span></a> -<a class="sourceLine" id="cb353-5" data-line-number="5">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"RLE plot: sctransform normalization"</span>)</a></code></pre></div> -<div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-sctransform"></span> -<img src="exprs-norm_files/figure-html/norm-ours-rle-sctransform-1.png" alt="Cell-wise RLE of the tung data" width="90%" /> -<p class="caption"> -Figure 7.8: Cell-wise RLE of the tung data -</p> -</div> -</div> -<div id="normalisation-for-genetranscript-length" class="section level3"> -<h3><span 
class="header-section-number">7.2.5</span> Normalisation for gene/transcript length</h3> -<p>Some methods combine library size and fragment/gene length normalization such -as:</p> -<ul> -<li><strong>RPKM</strong> - Reads Per Kilobase Million (for single-end sequencing)</li> -<li><strong>FPKM</strong> - Fragments Per Kilobase Million (same as <strong>RPKM</strong> but for paired-end -sequencing, makes sure that paired ends mapped to the same fragment are not -counted twice)</li> -<li><strong>TPM</strong> - Transcripts Per Kilobase Million (same as <strong>RPKM</strong>, but the order of -normalizations is reversed - length first and sequencing depth second)</li> -</ul> -<p>These methods are not applicable to our dataset since the end -of the transcript which contains the UMI was preferentially -sequenced. Furthermore in general these should only be calculated -using appropriate quantification software from aligned BAM files not -from read counts since often only a portion of the entire -gene/transcript is sequenced, not the entire length. If in doubt check -for a relationship between gene/transcript length and expression level.</p> -<p>However, here we show how these normalisations can be calculated using <code>scater</code>. -First, we need to find the effective transcript length in Kilobases. However, -our dataset contains only gene IDs, therefore we will be using the gene lengths -instead of transcripts. 
<code>scater</code> uses the -<a href="https://bioconductor.org/packages/release/bioc/html/biomaRt.html">biomaRt</a> -package, which allows one to annotate genes by other attributes:</p> -<div class="sourceCode" id="cb354"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb354-1" data-line-number="1">umi.qc <-<span class="st"> </span><span class="kw">getBMFeatureAnnos</span>(</a> -<a class="sourceLine" id="cb354-2" data-line-number="2"> umi.qc,</a> -<a class="sourceLine" id="cb354-3" data-line-number="3"> <span class="dt">filters =</span> <span class="st">"ensembl_gene_id"</span>, </a> -<a class="sourceLine" id="cb354-4" data-line-number="4"> <span class="dt">attributes =</span> <span class="kw">c</span>(</a> -<a class="sourceLine" id="cb354-5" data-line-number="5"> <span class="st">"ensembl_gene_id"</span>,</a> -<a class="sourceLine" id="cb354-6" data-line-number="6"> <span class="st">"hgnc_symbol"</span>,</a> -<a class="sourceLine" id="cb354-7" data-line-number="7"> <span class="st">"chromosome_name"</span>,</a> -<a class="sourceLine" id="cb354-8" data-line-number="8"> <span class="st">"start_position"</span>,</a> -<a class="sourceLine" id="cb354-9" data-line-number="9"> <span class="st">"end_position"</span></a> -<a class="sourceLine" id="cb354-10" data-line-number="10"> ), </a> -<a class="sourceLine" id="cb354-11" data-line-number="11"> <span class="dt">biomart =</span> <span class="st">"ENSEMBL_MART_ENSEMBL"</span>, </a> -<a class="sourceLine" id="cb354-12" data-line-number="12"> <span class="dt">dataset =</span> <span class="st">"hsapiens_gene_ensembl"</span>,</a> -<a class="sourceLine" id="cb354-13" data-line-number="13"> <span class="dt">host =</span> <span class="st">"www.ensembl.org"</span></a> -<a class="sourceLine" id="cb354-14" data-line-number="14">)</a> -<a class="sourceLine" id="cb354-15" data-line-number="15"></a> -<a class="sourceLine" id="cb354-16" data-line-number="16"><span class="co"># If you have mouse data, 
change the arguments based on this example:</span></a> -<a class="sourceLine" id="cb354-17" data-line-number="17"><span class="co"># getBMFeatureAnnos(</span></a> -<a class="sourceLine" id="cb354-18" data-line-number="18"><span class="co"># object,</span></a> -<a class="sourceLine" id="cb354-19" data-line-number="19"><span class="co"># filters = "ensembl_transcript_id",</span></a> -<a class="sourceLine" id="cb354-20" data-line-number="20"><span class="co"># attributes = c(</span></a> -<a class="sourceLine" id="cb354-21" data-line-number="21"><span class="co"># "ensembl_transcript_id",</span></a> -<a class="sourceLine" id="cb354-22" data-line-number="22"><span class="co"># "ensembl_gene_id", </span></a> -<a class="sourceLine" id="cb354-23" data-line-number="23"><span class="co"># "mgi_symbol",</span></a> -<a class="sourceLine" id="cb354-24" data-line-number="24"><span class="co"># "chromosome_name",</span></a> -<a class="sourceLine" id="cb354-25" data-line-number="25"><span class="co"># "transcript_biotype",</span></a> -<a class="sourceLine" id="cb354-26" data-line-number="26"><span class="co"># "transcript_start",</span></a> -<a class="sourceLine" id="cb354-27" data-line-number="27"><span class="co"># "transcript_end",</span></a> -<a class="sourceLine" id="cb354-28" data-line-number="28"><span class="co"># "transcript_count"</span></a> -<a class="sourceLine" id="cb354-29" data-line-number="29"><span class="co"># ),</span></a> -<a class="sourceLine" id="cb354-30" data-line-number="30"><span class="co"># biomart = "ENSEMBL_MART_ENSEMBL",</span></a> -<a class="sourceLine" id="cb354-31" data-line-number="31"><span class="co"># dataset = "mmusculus_gene_ensembl",</span></a> -<a class="sourceLine" id="cb354-32" data-line-number="32"><span class="co"># host = "www.ensembl.org"</span></a> -<a class="sourceLine" id="cb354-33" data-line-number="33"><span class="co"># )</span></a></code></pre></div> -<p>Some of the genes were not annotated, therefore we filter them out:</p> 
-<div class="sourceCode" id="cb355"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb355-1" data-line-number="1">umi.qc.ann <-<span class="st"> </span>umi.qc[<span class="op">!</span><span class="kw">is.na</span>(<span class="kw">rowData</span>(umi.qc)<span class="op">$</span>ensembl_gene_id), ]</a></code></pre></div> -<p>Now we compute the total gene length in Kilobases by using the <code>end_position</code> -and <code>start_position</code> fields:</p> -<div class="sourceCode" id="cb356"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb356-1" data-line-number="1">eff_length <-<span class="st"> </span></a> -<a class="sourceLine" id="cb356-2" data-line-number="2"><span class="st"> </span><span class="kw">abs</span>(<span class="kw">rowData</span>(umi.qc.ann)<span class="op">$</span>end_position <span class="op">-</span><span class="st"> </span><span class="kw">rowData</span>(umi.qc.ann)<span class="op">$</span>start_position) <span class="op">/</span><span class="st"> </span><span class="dv">1000</span></a></code></pre></div> -<div class="sourceCode" id="cb357"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb357-1" data-line-number="1"><span class="kw">plot</span>(eff_length, <span class="kw">rowMeans</span>(<span class="kw">counts</span>(umi.qc.ann)))</a></code></pre></div> -<div class="figure" style="text-align: center"><span id="fig:length-vs-mean"></span> -<img src="exprs-norm_files/figure-html/length-vs-mean-1.png" alt="Gene length vs Mean Expression for the raw data" width="90%" /> -<p class="caption"> -Figure 7.9: Gene length vs Mean Expression for the raw data -</p> -</div> -<p>There is no relationship between gene length and mean expression so __FPKM__s & -__TPM__s are inappropriate for this dataset. This is what we would expect for -UMI protocols that tag one end of the transcript. 
But we will demonstrate them -anyway.</p> -<p><strong>Note</strong> Here we calculate the total gene length instead of the total exon length. -Many genes will contain lots of introns so their <code>eff_length</code> will be very -different from what we have calculated. Please consider our calculation as an -approximation. If you want to use the total exon lengths, please refer to <a href="https://www.biostars.org/p/83901/">this -page</a>.</p> -<p>Now we are ready to perform the normalisations:</p> -<div class="sourceCode" id="cb358"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb358-1" data-line-number="1"><span class="kw">tpm</span>(umi.qc.ann) <-<span class="st"> </span><span class="kw">log2</span>(<span class="kw">calculateTPM</span>(umi.qc.ann, eff_length) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)</a></code></pre></div> -<p>Plot the results as a PCA plot:</p> -<div class="sourceCode" id="cb359"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb359-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> -<a class="sourceLine" id="cb359-2" data-line-number="2"> umi.qc.ann,</a> -<a class="sourceLine" id="cb359-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"tpm"</span>,</a> -<a class="sourceLine" id="cb359-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb359-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb359-6" data-line-number="6"> tmp,</a> -<a class="sourceLine" id="cb359-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb359-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb359-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a 
class="sourceLine" id="cb359-10" data-line-number="10">)</a></code></pre></div> -<div class="figure" style="text-align: center"><span id="fig:norm-pca-fpkm"></span> -<img src="exprs-norm_files/figure-html/norm-pca-fpkm-1.png" alt="PCA plot of the tung data after TPM normalisation" width="90%" /> -<p class="caption"> -Figure 7.10: PCA plot of the tung data after TPM normalisation -</p> -</div> -<div class="sourceCode" id="cb360"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb360-1" data-line-number="1"><span class="kw">tpm</span>(umi.qc.ann) <-<span class="st"> </span><span class="kw">log2</span>(<span class="kw">calculateFPKM</span>(umi.qc.ann, eff_length) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)</a></code></pre></div> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its -## own size factors</code></pre> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own -## size factors</code></pre> -<div class="sourceCode" id="cb363"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb363-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> -<a class="sourceLine" id="cb363-2" data-line-number="2"> umi.qc.ann,</a> -<a class="sourceLine" id="cb363-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"tpm"</span>,</a> -<a class="sourceLine" id="cb363-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb363-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb363-6" data-line-number="6"> tmp,</a> -<a class="sourceLine" id="cb363-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb363-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb363-9" 
data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb363-10" data-line-number="10">)</a></code></pre></div> -<div class="figure" style="text-align: center"><span id="fig:norm-pca-tpm"></span> -<img src="exprs-norm_files/figure-html/norm-pca-tpm-1.png" alt="PCA plot of the tung data after FPKM normalisation" width="90%" /> -<p class="caption"> -Figure 7.11: PCA plot of the tung data after FPKM normalisation -</p> -</div> -<p><strong>Note</strong> The <code>PCA</code> looks for differences between cells. Gene length is the same -across cells for each gene thus <strong>FPKM</strong> is almost identical to the <strong>CPM</strong> plot -(it is just rotated) since it performs <strong>CPM</strong> first then normalizes gene -length. Whereas, <strong>TPM</strong> is different because it weights genes by their length -before performing <strong>CPM</strong>.</p> -</div> -<div id="reflection" class="section level3"> -<h3><span class="header-section-number">7.2.6</span> Reflection</h3> -<p><strong>Q:</strong> What is your assessment of the performance of these different -normalization methods on the data presented here?</p> -<p><strong>Q:</strong> Which normalization method would you prefer for this dataset? Why?</p> -</div> -<div id="exercise" class="section level3"> -<h3><span class="header-section-number">7.2.7</span> Exercise</h3> -<p>Perform the same analysis with read counts of the <code>tung</code> data. Use -<code>tung/reads.rds</code> file to load the reads <code>SCE</code> object. 
Once you have finished -please compare your results to ours (next chapter).</p> -</div> -<div id="sessioninfo-2" class="section level3"> -<h3><span class="header-section-number">7.2.8</span> sessionInfo()</h3> -<pre><code>## R version 3.6.0 (2019-04-26) -## Platform: x86_64-pc-linux-gnu (64-bit) -## Running under: Ubuntu 18.04.3 LTS -## -## Matrix products: default -## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 -## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 -## -## locale: -## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C -## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 -## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 -## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C -## [9] LC_ADDRESS=C LC_TELEPHONE=C -## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C -## -## attached base packages: -## [1] parallel stats4 stats graphics grDevices utils datasets -## [8] methods base -## -## other attached packages: -## [1] scran_1.12.1 scater_1.12.2 -## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 -## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 -## [7] BiocParallel_1.18.1 matrixStats_0.55.0 -## [9] Biobase_2.44.0 GenomicRanges_1.36.1 -## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 -## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 -## [15] scRNA.seq.funcs_0.1.0 -## -## loaded via a namespace (and not attached): -## [1] bitops_1.0-6 bit64_0.9-7 -## [3] httr_1.4.1 progress_1.2.2 -## [5] dynamicTreeCut_1.63-1 backports_1.1.4 -## [7] sctransform_0.2.0 tools_3.6.0 -## [9] R6_2.4.0 irlba_2.3.3 -## [11] hypergeo_1.2-13 vipor_0.4.5 -## [13] DBI_1.0.0 lazyeval_0.2.2 -## [15] colorspace_1.4-1 withr_2.1.2 -## [17] prettyunits_1.0.2 tidyselect_0.2.5 -## [19] gridExtra_2.3 moments_0.14 -## [21] curl_4.2 bit_1.1-14 -## [23] compiler_3.6.0 orthopolynom_1.0-5 -## [25] BiocNeighbors_1.2.0 labeling_0.3 -## [27] bookdown_0.13 scales_1.0.0 -## [29] stringr_1.4.0 digest_0.6.21 -## [31] rmarkdown_1.15 XVector_0.24.0 -## [33] pkgconfig_2.0.3 htmltools_0.3.6 -## [35] limma_3.40.6 
highr_0.8 -## [37] rlang_0.4.0 RSQLite_2.1.2 -## [39] DelayedMatrixStats_1.6.1 dplyr_0.8.3 -## [41] RCurl_1.95-4.12 magrittr_1.5 -## [43] BiocSingular_1.0.0 GenomeInfoDbData_1.2.1 -## [45] Matrix_1.2-17 Rcpp_1.0.2 -## [47] ggbeeswarm_0.6.0 munsell_0.5.0 -## [49] viridis_0.5.1 stringi_1.4.3 -## [51] yaml_2.2.0 edgeR_3.26.8 -## [53] MASS_7.3-51.1 zlibbioc_1.30.0 -## [55] Rtsne_0.15 plyr_1.8.4 -## [57] blob_1.2.0 grid_3.6.0 -## [59] listenv_0.7.0 dqrng_0.2.1 -## [61] crayon_1.3.4 contfrac_1.1-12 -## [63] lattice_0.20-38 cowplot_1.0.0 -## [65] hms_0.5.1 locfit_1.5-9.1 -## [67] zeallot_0.1.0 knitr_1.25 -## [69] pillar_1.4.2 igraph_1.2.4.1 -## [71] future.apply_1.3.0 reshape2_1.4.3 -## [73] codetools_0.2-16 biomaRt_2.40.4 -## [75] XML_3.98-1.20 glue_1.3.1 -## [77] evaluate_0.14 deSolve_1.24 -## [79] vctrs_0.2.0 gtable_0.3.0 -## [81] purrr_0.3.2 future_1.14.0 -## [83] assertthat_0.2.1 xfun_0.9 -## [85] rsvd_1.0.2 viridisLite_0.3.0 -## [87] tibble_2.1.3 elliptic_1.4-0 -## [89] memoise_1.1.0 AnnotationDbi_1.46.1 -## [91] beeswarm_0.2.3 globals_0.12.4 -## [93] statmod_1.4.32</code></pre> +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## + | + |================================================= | 75%</code></pre> +<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration 
limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu 
= fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -</div> -</div> -<div id="normalization-practice-reads" class="section level2"> -<h2><span class="header-section-number">7.3</span> Normalization practice (Reads)</h2> -<div class="figure" style="text-align: center"><span id="fig:norm-pca-raw-reads"></span> -<img src="exprs-norm-reads_files/figure-html/norm-pca-raw-reads-1.png" alt="PCA plot of the tung data" width="90%" /> -<p class="caption"> -Figure 7.12: PCA plot of the tung data -</p> -</div> -<div class="figure" style="text-align: center"><span id="fig:norm-pca-cpm-reads"></span> -<img src="exprs-norm-reads_files/figure-html/norm-pca-cpm-reads-1.png" alt="PCA plot of the tung data after CPM normalisation" width="90%" /> -<p class="caption"> -Figure 7.13: PCA plot of the tung data after CPM normalisation -</p> -</div> -<div class="figure" style="text-align: center"><span 
id="fig:norm-ours-rle-cpm-reads1"></span> -<img src="exprs-norm-reads_files/figure-html/norm-ours-rle-cpm-reads-1.png" alt="Cell-wise RLE of the tung data" width="90%" /> -<p class="caption"> -Figure 7.14: Cell-wise RLE of the tung data -</p> -</div> -<div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-cpm-reads2"></span> -<img src="exprs-norm-reads_files/figure-html/norm-ours-rle-cpm-reads-2.png" alt="Cell-wise RLE of the tung data" width="90%" /> -<p class="caption"> -Figure 7.15: Cell-wise RLE of the tung data -</p> -</div> -<pre><code>## Warning: Setting 'use.ranks=TRUE' for the old defaults. -## Set 'use.ranks=FALSE' for the new defaults.</code></pre> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its -## own size factors</code></pre> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own -## size factors</code></pre> -<div class="figure" style="text-align: center"><span id="fig:norm-pca-lsf-umi"></span> -<img src="exprs-norm-reads_files/figure-html/norm-pca-lsf-umi-1.png" alt="PCA plot of the tung data after LSF normalisation" width="90%" /> -<p class="caption"> -Figure 7.16: PCA plot of the tung data after LSF normalisation -</p> -</div> -<div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-scran-reads1"></span> -<img src="exprs-norm-reads_files/figure-html/norm-ours-rle-scran-reads-1.png" alt="Cell-wise RLE of the tung data" width="90%" /> -<p class="caption"> -Figure 7.17: Cell-wise RLE of the tung data -</p> -</div> -<div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-scran-reads2"></span> -<img src="exprs-norm-reads_files/figure-html/norm-ours-rle-scran-reads-2.png" alt="Cell-wise RLE of the tung data" width="90%" /> -<p class="caption"> -Figure 7.18: Cell-wise RLE of the tung data -</p> -</div> -<pre><code>## Calculating cell attributes for input UMI matrix</code></pre> -<pre><code>## Variance stabilizing 
transformation of count matrix of size 16062 by 606</code></pre> -<pre><code>## Model formula is y ~ log10_total_counts_endogenous</code></pre> -<pre><code>## Get Negative Binomial regression parameters per gene</code></pre> -<pre><code>## Using 2000 genes, 606 cells</code></pre> -<pre><code>## - | - | | 0%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> <pre><code>## | - |======== | 12%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> + |========================================================= | 88%</code></pre> <pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## - | - |================ | 25%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -4391,15 +5725,9 @@ Figure 7.18: Cell-wise RLE of the tung data ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> 
-<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -4411,25 +5739,11 @@ Figure 7.18: Cell-wise RLE of the tung data ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## - | - |======================== | 38%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -4439,37 +5753,15 @@ Figure 7.18: Cell-wise RLE of the tung data ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## - | - |================================ | 50%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## - | - |========================================= | 62%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y 
= y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached @@ -4479,858 +5771,458 @@ Figure 7.18: Cell-wise RLE of the tung data ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## - | - |================================================= | 75%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## - | - |========================================================= | 88%</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu 
= fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + 
+## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> -<pre><code>## Warning in sqrt(1/i): NaNs produced</code></pre> -<pre><code>## - | - |=================================================================| 100%</code></pre> -<pre><code>## Found 1 outliers - those will be ignored in fitting/regularization step</code></pre> -<pre><code>## Second step: Get residuals using fitted parameters for 16062 genes</code></pre> -<pre><code>## - | - | | 0% - | - |= | 2% - | - |== | 3% - | - |=== | 5% - | - |==== | 6% - | - |===== | 8% - | - |====== | 10% - | - |======= | 11% - | - |======== | 13% - | - |========= | 14% - | - |========== | 16% - | - |=========== | 17% - | - |============ | 19% - | - |============= | 21% - | - |============== | 22% - | - |=============== | 24% - | - |================= | 25% - | - |================== | 27% - | - |=================== | 29% - | - |==================== | 30% - | - |===================== | 32% - | - |====================== | 33% - | - |======================= | 35% - | - |======================== | 37% - | - |========================= | 38% - | - |========================== | 40% - | - |=========================== | 41% - | - |============================ | 43% - | - |============================= | 44% - | - |============================== | 46% - | - |=============================== | 48% - | - |================================ | 49% - | - |================================= | 51% - 
| - |================================== | 52% - | - |=================================== | 54% - | - |==================================== | 56% - | - |===================================== | 57% - | - |====================================== | 59% - | - |======================================= | 60% - | - |======================================== | 62% - | - |========================================= | 63% - | - |========================================== | 65% - | - |=========================================== | 67% - | - |============================================ | 68% - | - |============================================= | 70% - | - |============================================== | 71% - | - |=============================================== | 73% - | - |================================================ | 75% - | - |================================================== | 76% - | - |=================================================== | 78% - | - |==================================================== | 79% - | - |===================================================== | 81% - | - |====================================================== | 83% - | - |======================================================= | 84% - | - |======================================================== | 86% - | - |========================================================= | 87% - | - |========================================================== | 89% - | - |=========================================================== | 90% - | - |============================================================ | 92% - | - |============================================================= | 94% - | - |============================================================== | 95% - | - |=============================================================== | 97% - | - |================================================================ | 98% - | - |=================================================================| 100%</code></pre> -<pre><code>## 
Calculating gene attributes</code></pre> -<pre><code>## Wall clock passed: Time difference of 16.19305 secs</code></pre> -<pre><code>## [1] 16062 606</code></pre> -<pre><code>## [1] 16062 606</code></pre> -<pre><code>## [1] "y ~ log10_total_counts_endogenous"</code></pre> -<p>Let us look at the NB GLM model parameters estimated by sctransform.</p> -<p><img src="exprs-norm-reads_files/figure-html/norm-ours-sctransform-params-plot-reads-1.png" width="90%" style="display: block; margin: auto;" /></p> -<p>We can look at the effect of sctransform’s normalization on three particular -genes, ACTB, POU5F1 (aka OCT4) and CD74.</p> -<p><img src="exprs-norm-reads_files/figure-html/norm-ours-sctransform-genes-plot-reads-1.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="figure" style="text-align: center"><span id="fig:norm-ours-pca-sctransform-reads"></span> -<img src="exprs-norm-reads_files/figure-html/norm-ours-pca-sctransform-reads-1.png" alt="PCA plot of the tung reads data after sctransform normalisation (Pearson residuals)." width="90%" /> -<p class="caption"> -Figure 7.19: PCA plot of the tung reads data after sctransform normalisation (Pearson residuals). 
-</p> -</div> -<div class="figure" style="text-align: center"><span id="fig:norm-ours-rle-sctransform-reads"></span> -<img src="exprs-norm-reads_files/figure-html/norm-ours-rle-sctransform-reads-1.png" alt="Cell-wise RLE of the tung reads data" width="90%" /> -<p class="caption"> -Figure 7.20: Cell-wise RLE of the tung reads data -</p> -</div> -<div class="figure" style="text-align: center"><span id="fig:norm-pca-tpm-reads"></span> -<img src="exprs-norm-reads_files/figure-html/norm-pca-tpm-reads-1.png" alt="PCA plot of the tung data after TPM normalisation" width="90%" /> -<p class="caption"> -Figure 7.21: PCA plot of the tung data after TPM normalisation -</p> -</div> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its -## own size factors</code></pre> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own -## size factors</code></pre> -<pre><code>## R version 3.6.0 (2019-04-26) -## Platform: x86_64-pc-linux-gnu (64-bit) -## Running under: Ubuntu 18.04.3 LTS -## -## Matrix products: default -## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 -## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 -## -## locale: -## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C -## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 -## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 -## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C -## [9] LC_ADDRESS=C LC_TELEPHONE=C -## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C -## -## attached base packages: -## [1] parallel stats4 stats graphics grDevices utils datasets -## [8] methods base -## -## other attached packages: -## [1] scran_1.12.1 scater_1.12.2 -## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 -## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 -## [7] BiocParallel_1.18.1 matrixStats_0.55.0 -## [9] Biobase_2.44.0 GenomicRanges_1.36.1 -## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 -## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 -## [15] scRNA.seq.funcs_0.1.0 
-## -## loaded via a namespace (and not attached): -## [1] bitops_1.0-6 bit64_0.9-7 -## [3] httr_1.4.1 progress_1.2.2 -## [5] dynamicTreeCut_1.63-1 backports_1.1.4 -## [7] sctransform_0.2.0 tools_3.6.0 -## [9] R6_2.4.0 irlba_2.3.3 -## [11] hypergeo_1.2-13 vipor_0.4.5 -## [13] DBI_1.0.0 lazyeval_0.2.2 -## [15] colorspace_1.4-1 withr_2.1.2 -## [17] prettyunits_1.0.2 tidyselect_0.2.5 -## [19] gridExtra_2.3 moments_0.14 -## [21] curl_4.2 bit_1.1-14 -## [23] compiler_3.6.0 orthopolynom_1.0-5 -## [25] BiocNeighbors_1.2.0 labeling_0.3 -## [27] bookdown_0.13 scales_1.0.0 -## [29] stringr_1.4.0 digest_0.6.21 -## [31] rmarkdown_1.15 XVector_0.24.0 -## [33] pkgconfig_2.0.3 htmltools_0.3.6 -## [35] limma_3.40.6 highr_0.8 -## [37] rlang_0.4.0 RSQLite_2.1.2 -## [39] DelayedMatrixStats_1.6.1 dplyr_0.8.3 -## [41] RCurl_1.95-4.12 magrittr_1.5 -## [43] BiocSingular_1.0.0 GenomeInfoDbData_1.2.1 -## [45] Matrix_1.2-17 Rcpp_1.0.2 -## [47] ggbeeswarm_0.6.0 munsell_0.5.0 -## [49] viridis_0.5.1 stringi_1.4.3 -## [51] yaml_2.2.0 edgeR_3.26.8 -## [53] MASS_7.3-51.1 zlibbioc_1.30.0 -## [55] Rtsne_0.15 plyr_1.8.4 -## [57] blob_1.2.0 grid_3.6.0 -## [59] listenv_0.7.0 dqrng_0.2.1 -## [61] crayon_1.3.4 contfrac_1.1-12 -## [63] lattice_0.20-38 cowplot_1.0.0 -## [65] hms_0.5.1 locfit_1.5-9.1 -## [67] zeallot_0.1.0 knitr_1.25 -## [69] pillar_1.4.2 igraph_1.2.4.1 -## [71] future.apply_1.3.0 reshape2_1.4.3 -## [73] codetools_0.2-16 biomaRt_2.40.4 -## [75] XML_3.98-1.20 glue_1.3.1 -## [77] evaluate_0.14 deSolve_1.24 -## [79] vctrs_0.2.0 gtable_0.3.0 -## [81] purrr_0.3.2 future_1.14.0 -## [83] assertthat_0.2.1 xfun_0.9 -## [85] rsvd_1.0.2 viridisLite_0.3.0 -## [87] tibble_2.1.3 elliptic_1.4-0 -## [89] memoise_1.1.0 AnnotationDbi_1.46.1 -## [91] beeswarm_0.2.3 globals_0.12.4 -## [93] statmod_1.4.32</code></pre> +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -</div> -<div id="identifying-confounding-factors" class="section level2"> -<h2><span class="header-section-number">7.4</span> Identifying confounding factors</h2> -<div id="introduction-5" class="section level3"> -<h3><span class="header-section-number">7.4.1</span> Introduction</h3> -<p>There is a large number of potential confounders, artifacts and biases in -scRNA-seq data. One of the main challenges in analysing scRNA-seq data stems -from the fact that it is difficult to carry out a true technical replication -(why?) to distinguish biological and technical variability. In the previous -chapters we considered normalization and in this chapter we will continue to -explore how experimental artifacts can be identified and removed. We will -continue using the <code>scater</code> package since it provides a set of methods -specifically for quality control of experimental and explanatory variables. 
-Moreover, we will continue to work with the Blischak data that was used in the -previous chapter.</p> -<div class="sourceCode" id="cb476"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb476-1" data-line-number="1"><span class="kw">library</span>(scater, <span class="dt">quietly =</span> <span class="ot">TRUE</span>)</a> -<a class="sourceLine" id="cb476-2" data-line-number="2"><span class="kw">library</span>(scran)</a> -<a class="sourceLine" id="cb476-3" data-line-number="3"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb476-4" data-line-number="4">umi <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/tung/umi.rds"</span>)</a> -<a class="sourceLine" id="cb476-5" data-line-number="5">umi.qc <-<span class="st"> </span>umi[<span class="kw">rowData</span>(umi)<span class="op">$</span>use, <span class="kw">colData</span>(umi)<span class="op">$</span>use]</a> -<a class="sourceLine" id="cb476-6" data-line-number="6">endog_genes <-<span class="st"> </span><span class="op">!</span><span class="kw">rowData</span>(umi.qc)<span class="op">$</span>is_feature_control</a></code></pre></div> -<p>The <code>umi.qc</code> dataset contains filtered cells and genes. 
Our next step is to -explore technical drivers of variability in the data to inform data -normalisation before downstream analysis.</p> -</div> -<div id="correlations-with-pcs" class="section level3"> -<h3><span class="header-section-number">7.4.2</span> Correlations with PCs</h3> -<p>Let’s first look again at the PCA plot of the QCed dataset using the -scran-normalized log2-CPM values:</p> -<div class="sourceCode" id="cb477"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb477-1" data-line-number="1">qclust <-<span class="st"> </span><span class="kw">quickCluster</span>(umi.qc, <span class="dt">min.size =</span> <span class="dv">30</span>, <span class="dt">use.ranks =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb477-2" data-line-number="2">umi.qc <-<span class="st"> </span><span class="kw">computeSumFactors</span>(umi.qc, <span class="dt">sizes =</span> <span class="dv">15</span>, <span class="dt">clusters =</span> qclust)</a> -<a class="sourceLine" id="cb477-3" data-line-number="3">umi.qc <-<span class="st"> </span><span class="kw">normalize</span>(umi.qc)</a></code></pre></div> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its -## own size factors</code></pre> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own -## size factors</code></pre> -<div class="sourceCode" id="cb480"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb480-1" data-line-number="1"><span class="kw">reducedDim</span>(umi.qc, <span class="st">"PCA"</span>) <-<span class="st"> </span><span class="kw">reducedDim</span>(</a> -<a class="sourceLine" id="cb480-2" data-line-number="2"> <span class="kw">runPCA</span>(umi.qc[endog_genes,],</a> -<a class="sourceLine" id="cb480-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts"</span>, <span class="dt">ncomponents =</span> <span class="dv">10</span>), 
<span class="st">"PCA"</span>)</a> -<a class="sourceLine" id="cb480-4" data-line-number="4"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb480-5" data-line-number="5"> umi.qc,</a> -<a class="sourceLine" id="cb480-6" data-line-number="6"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb480-7" data-line-number="7"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span></a> -<a class="sourceLine" id="cb480-8" data-line-number="8">)</a></code></pre></div> -<div class="figure" style="text-align: center"><span id="fig:confound-pca"></span> -<img src="confounders_files/figure-html/confound-pca-1.png" alt="PCA plot of the tung data" width="90%" /> -<p class="caption"> -Figure 7.22: PCA plot of the tung data -</p> -</div> -<p><code>scater</code> allows one to identify principal components that correlate with -experimental and QC variables of interest (it ranks principle components by -<span class="math inline">\(R^2\)</span> from a linear model regressing PC value against the variable of interest).</p> -<p>Let’s test whether some of the variables correlate with any of the PCs.</p> -<div id="top-coldata-variables-associated-with-pcs" class="section level4"> -<h4><span class="header-section-number">7.4.2.1</span> Top colData variables associated with PCs</h4> -<p>The plot below shows, for each of the first 10 PCs, the variance explained by -the ten variables in <code>colData(umi.qc)</code> that are most strongly associated with -the PCs. 
[We will ignore the <code>sample_id</code> variable: it has a unique value for -each cell, so can explain all the variation for all PCs.]</p> -<div class="sourceCode" id="cb481"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb481-1" data-line-number="1"><span class="kw">plotExplanatoryPCs</span>(umi.qc)</a></code></pre></div> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'is_cell_control' with fewer than 2 unique levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_100_features_feature_control' with fewer than 2 -## unique levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_200_features_feature_control' with fewer than 2 -## unique levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_500_features_feature_control' with fewer than 2 -## unique levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_50_features_ERCC' with fewer than 2 unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_100_features_ERCC' with fewer than 2 unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_200_features_ERCC' with fewer than 2 unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_500_features_ERCC' with fewer than 2 unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_50_features_MT' 
with fewer than 2 unique levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_100_features_MT' with fewer than 2 unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_200_features_MT' with fewer than 2 unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_500_features_MT' with fewer than 2 unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'use' with fewer than 2 unique levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'outlier' with fewer than 2 unique levels</code></pre> -<pre><code>## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf</code></pre> -<div class="figure" style="text-align: center"><span id="fig:confound-find-pcs-total-features"></span> -<img src="confounders_files/figure-html/confound-find-pcs-total-features-1.png" alt="PC correlation with the number of detected genes" width="90%" /> -<p class="caption"> -Figure 7.23: PC correlation with the number of detected genes -</p> -</div> -<p>Indeed, we can see that <code>PC1</code> can be almost completely 
explained by <code>batch</code> and -<code>individual</code> (of course batch is nested within individual). The total counts -from ERCC spike-ins also explains a substantial proportion of the variability in -PC1.</p> -<p>Although number of detected genes is not strongly correlated with the PCs here -(after normalization), this is commonly the case and something to look out for. -[You might like to replicate the plot above using raw logcounts values to see -what happens without normalization]. This is a well-known issue in scRNA-seq and -was described <a href="http://biorxiv.org/content/early/2015/12/27/025528">here</a>.</p> -</div> -</div> -<div id="explanatory-variables" class="section level3"> -<h3><span class="header-section-number">7.4.3</span> Explanatory variables</h3> -<p><code>scater</code> can also compute the marginal <span class="math inline">\(R^2\)</span> for each variable when fitting a -linear model regressing expression values for each gene against just that -variable, and display a density plot of the gene-wise marginal <span class="math inline">\(R^2\)</span> values for -the variables.</p> -<div class="sourceCode" id="cb497"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb497-1" data-line-number="1"><span class="kw">plotExplanatoryVariables</span>(</a> -<a class="sourceLine" id="cb497-2" data-line-number="2"> umi.qc,</a> -<a class="sourceLine" id="cb497-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span>,</a> -<a class="sourceLine" id="cb497-4" data-line-number="4"> <span class="dt">variables =</span> <span class="kw">c</span>(</a> -<a class="sourceLine" id="cb497-5" data-line-number="5"> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb497-6" data-line-number="6"> <span class="st">"total_counts"</span>,</a> -<a class="sourceLine" id="cb497-7" data-line-number="7"> <span class="st">"batch"</span>,</a> -<a class="sourceLine" 
id="cb497-8" data-line-number="8"> <span class="st">"individual"</span>,</a> -<a class="sourceLine" id="cb497-9" data-line-number="9"> <span class="st">"pct_counts_ERCC"</span>,</a> -<a class="sourceLine" id="cb497-10" data-line-number="10"> <span class="st">"pct_counts_MT"</span></a> -<a class="sourceLine" id="cb497-11" data-line-number="11"> )</a> -<a class="sourceLine" id="cb497-12" data-line-number="12">)</a></code></pre></div> -<div class="figure" style="text-align: center"><span id="fig:confound-find-expl-vars"></span> -<img src="confounders_files/figure-html/confound-find-expl-vars-1.png" alt="Explanatory variables" width="90%" /> -<p class="caption"> -Figure 7.24: Explanatory variables -</p> -</div> -<p>This analysis indicates that the number of detected genes (again) and also the -sequencing depth (total number of UMI counts per cell) have substantial -explanatory power for many genes, so these variables are good candidates for -conditioning out in a normalization step, or including in downstream statistical -models [cf. <code>sctransform</code>’s approach to normalization]. Expression of ERCCs also -appears to be an important explanatory variable and one notable feature of the -above plot is that batch explains more than individual. What does that tell us -about the technical and biological variability of the data?</p> -</div> -<div id="other-confounders" class="section level3"> -<h3><span class="header-section-number">7.4.4</span> Other confounders</h3> -<p>In addition to correcting for batch, there are other factors that one may want -to compensate for. As with batch correction, these adjustments require extrinsic -information. 
One popular method is <a href="https://github.com/PMBio/scLVM">scLVM</a> which -allows you to identify and subtract the effect from processes such as cell-cycle -or apoptosis.</p> -<p>In addition, protocols may differ in terms of their coverage of each transcript, -their bias based on the average content of <strong>A/T</strong> nucleotides, or their ability -to capture short transcripts. Ideally, we would like to compensate for all of -these differences and biases.</p> -</div> -<div id="exercise-1" class="section level3"> -<h3><span class="header-section-number">7.4.5</span> Exercise</h3> -<p>Perform the same analysis with read counts of the Blischak data. Use -<code>tung/reads.rds</code> file to load the reads SCESet object. Once you have finished -please compare your results to ours (next chapter).</p> -</div> -<div id="sessioninfo-3" class="section level3"> -<h3><span class="header-section-number">7.4.6</span> sessionInfo()</h3> -<pre><code>## R version 3.6.0 (2019-04-26) -## Platform: x86_64-pc-linux-gnu (64-bit) -## Running under: Ubuntu 18.04.3 LTS -## -## Matrix products: default -## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 -## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 -## -## locale: -## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C -## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 -## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 -## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C -## [9] LC_ADDRESS=C LC_TELEPHONE=C -## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C -## -## attached base packages: -## [1] parallel stats4 stats graphics grDevices utils datasets -## [8] methods base -## -## other attached packages: -## [1] scran_1.12.1 scater_1.12.2 -## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 -## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 -## [7] BiocParallel_1.18.1 matrixStats_0.55.0 -## [9] Biobase_2.44.0 GenomicRanges_1.36.1 -## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 -## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 -## 
[15] knitr_1.25 -## -## loaded via a namespace (and not attached): -## [1] locfit_1.5-9.1 Rcpp_1.0.2 -## [3] rsvd_1.0.2 lattice_0.20-38 -## [5] assertthat_0.2.1 digest_0.6.21 -## [7] R6_2.4.0 dynamicTreeCut_1.63-1 -## [9] evaluate_0.14 highr_0.8 -## [11] pillar_1.4.2 zlibbioc_1.30.0 -## [13] rlang_0.4.0 lazyeval_0.2.2 -## [15] irlba_2.3.3 Matrix_1.2-17 -## [17] rmarkdown_1.15 labeling_0.3 -## [19] BiocNeighbors_1.2.0 statmod_1.4.32 -## [21] stringr_1.4.0 igraph_1.2.4.1 -## [23] RCurl_1.95-4.12 munsell_0.5.0 -## [25] compiler_3.6.0 vipor_0.4.5 -## [27] BiocSingular_1.0.0 xfun_0.9 -## [29] pkgconfig_2.0.3 ggbeeswarm_0.6.0 -## [31] htmltools_0.3.6 tidyselect_0.2.5 -## [33] tibble_2.1.3 gridExtra_2.3 -## [35] GenomeInfoDbData_1.2.1 bookdown_0.13 -## [37] edgeR_3.26.8 viridisLite_0.3.0 -## [39] crayon_1.3.4 dplyr_0.8.3 -## [41] withr_2.1.2 bitops_1.0-6 -## [43] grid_3.6.0 gtable_0.3.0 -## [45] magrittr_1.5 scales_1.0.0 -## [47] dqrng_0.2.1 stringi_1.4.3 -## [49] XVector_0.24.0 viridis_0.5.1 -## [51] limma_3.40.6 DelayedMatrixStats_1.6.1 -## [53] cowplot_1.0.0 tools_3.6.0 -## [55] glue_1.3.1 beeswarm_0.2.3 -## [57] purrr_0.3.2 yaml_2.2.0 -## [59] colorspace_1.4-1</code></pre> +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -</div> -</div> -<div id="identifying-confounding-factors-reads" class="section level2"> -<h2><span class="header-section-number">7.5</span> Identifying confounding factors (Reads)</h2> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its -## own size factors</code></pre> -<pre><code>## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own -## size factors</code></pre> -<div class="figure" style="text-align: center"><span id="fig:confound-pca-reads"></span> -<img src="confounders-reads_files/figure-html/confound-pca-reads-1.png" alt="PCA plot of the tung data" width="90%" /> -<p class="caption"> -Figure 7.25: PCA plot of the tung data -</p> -</div> -<pre><code>## Warning 
in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'is_cell_control' with fewer than 2 unique levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_100_features_feature_control' with fewer than 2 -## unique levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_200_features_feature_control' with fewer than 2 -## unique levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_500_features_feature_control' with fewer than 2 -## unique levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_50_features_ERCC' with fewer than 2 unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_100_features_ERCC' with fewer than 2 unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_200_features_ERCC' with fewer than 2 unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_500_features_ERCC' with fewer than 2 unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_50_features_MT' with fewer than 2 unique levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_100_features_MT' with fewer than 2 unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_200_features_MT' with fewer than 2 
unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'pct_counts_in_top_500_features_MT' with fewer than 2 unique -## levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'use' with fewer than 2 unique levels</code></pre> -<pre><code>## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): -## ignoring 'outlier' with fewer than 2 unique levels</code></pre> -<pre><code>## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in 
FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -## -Inf</code></pre> -<div class="figure" style="text-align: center"><span id="fig:confound-find-pcs-reads"></span> -<img src="confounders-reads_files/figure-html/confound-find-pcs-reads-1.png" alt="PC correlation with the number of detected genes" width="90%" /> -<p class="caption"> -Figure 7.26: PC correlation with the number of detected genes -</p> -</div> -<div class="figure" style="text-align: center"><span id="fig:confound-find-expl-vars-reads"></span> -<img src="confounders-reads_files/figure-html/confound-find-expl-vars-reads-1.png" alt="Explanatory variables" width="90%" /> -<p class="caption"> -Figure 7.27: Explanatory variables -</p> -</div> -<pre><code>## R version 3.6.0 (2019-04-26) -## Platform: x86_64-pc-linux-gnu (64-bit) -## Running under: Ubuntu 18.04.3 LTS -## -## Matrix products: default -## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 -## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 -## -## locale: -## [1] LC_CTYPE=en_AU.UTF-8 
LC_NUMERIC=C -## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 -## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 -## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C -## [9] LC_ADDRESS=C LC_TELEPHONE=C -## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C -## -## attached base packages: -## [1] parallel stats4 stats graphics grDevices utils datasets -## [8] methods base -## -## other attached packages: -## [1] scran_1.12.1 scater_1.12.2 -## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 -## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 -## [7] BiocParallel_1.18.1 matrixStats_0.55.0 -## [9] Biobase_2.44.0 GenomicRanges_1.36.1 -## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 -## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 -## -## loaded via a namespace (and not attached): -## [1] locfit_1.5-9.1 Rcpp_1.0.2 -## [3] rsvd_1.0.2 lattice_0.20-38 -## [5] assertthat_0.2.1 digest_0.6.21 -## [7] R6_2.4.0 dynamicTreeCut_1.63-1 -## [9] evaluate_0.14 highr_0.8 -## [11] pillar_1.4.2 zlibbioc_1.30.0 -## [13] rlang_0.4.0 lazyeval_0.2.2 -## [15] irlba_2.3.3 Matrix_1.2-17 -## [17] rmarkdown_1.15 labeling_0.3 -## [19] BiocNeighbors_1.2.0 statmod_1.4.32 -## [21] stringr_1.4.0 igraph_1.2.4.1 -## [23] RCurl_1.95-4.12 munsell_0.5.0 -## [25] compiler_3.6.0 vipor_0.4.5 -## [27] BiocSingular_1.0.0 xfun_0.9 -## [29] pkgconfig_2.0.3 ggbeeswarm_0.6.0 -## [31] htmltools_0.3.6 tidyselect_0.2.5 -## [33] tibble_2.1.3 gridExtra_2.3 -## [35] GenomeInfoDbData_1.2.1 bookdown_0.13 -## [37] edgeR_3.26.8 viridisLite_0.3.0 -## [39] crayon_1.3.4 dplyr_0.8.3 -## [41] withr_2.1.2 bitops_1.0-6 -## [43] grid_3.6.0 gtable_0.3.0 -## [45] magrittr_1.5 scales_1.0.0 -## [47] dqrng_0.2.1 stringi_1.4.3 -## [49] XVector_0.24.0 viridis_0.5.1 -## [51] limma_3.40.6 DelayedMatrixStats_1.6.1 -## [53] cowplot_1.0.0 tools_3.6.0 -## [55] glue_1.3.1 beeswarm_0.2.3 -## [57] purrr_0.3.2 yaml_2.2.0 -## [59] colorspace_1.4-1 knitr_1.25</code></pre> +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached</code></pre> +<pre><code>## + | + |=================================================================| 100%</code></pre> +<pre><code>## Found 123 outliers - those will be ignored in fitting/regularization step</code></pre> +<pre><code>## Second step: Get residuals using fitted parameters for 14066 genes</code></pre> +<pre><code>## + | + | | 0% + | + |= | 2% + | + |== | 4% + | + |==== | 5% + | + |===== | 7% + | + |====== | 9% + | + |======= | 11% + | + |======== | 13% + | + |========= | 15% + | + |=========== | 16% + | + |============ | 18% + | + |============= | 20% + | + |============== | 22% + | + |=============== | 24% + | + |================= | 25% + | + |================== | 27% + | + |=================== | 29% + | + |==================== | 31% + | + |===================== | 33% + | + |====================== | 35% + | + |======================== | 36% + | + |========================= | 38% + | + |========================== | 40% + | + |=========================== | 42% + | + |============================ | 44% + | + |============================== | 45% + | + |=============================== | 47% + | + |================================ | 49% + | + |================================= | 51% + | + |================================== | 53% + | + |=================================== | 55% + | + |===================================== | 56% + | + |====================================== | 58% + | + |======================================= | 60% + | + |======================================== | 62% + | + |========================================= | 64% + | + |=========================================== | 65% + | + |============================================ | 67% + | + |============================================= | 69% + | + |============================================== | 71% + | + |=============================================== | 73% + | + |================================================ | 75% + | + 
|================================================== | 76% + | + |=================================================== | 78% + | + |==================================================== | 80% + | + |===================================================== | 82% + | + |====================================================== | 84% + | + |======================================================== | 85% + | + |========================================================= | 87% + | + |========================================================== | 89% + | + |=========================================================== | 91% + | + |============================================================ | 93% + | + |============================================================= | 95% + | + |=============================================================== | 96% + | + |================================================================ | 98% + | + |=================================================================| 100%</code></pre> +<pre><code>## Calculating gene attributes</code></pre> +<pre><code>## Wall clock passed: Time difference of 28.12818 secs</code></pre> +<div class="sourceCode" id="cb530"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb530-1" data-line-number="1"><span class="co">## Pearson residuals, or deviance residuals</span></a> +<a class="sourceLine" id="cb530-2" data-line-number="2">sctnorm_data<span class="op">$</span>model_str</a></code></pre></div> +<pre><code>## [1] "y ~ log10_total_counts_endogenous + batch"</code></pre> +<div class="sourceCode" id="cb532"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb532-1" data-line-number="1"><span class="kw">assay</span>(umi.qc, <span class="st">"sctrans_norm"</span>) <-<span class="st"> </span>sctnorm_data<span class="op">$</span>y</a></code></pre></div> +<p>Let us look at the NB GLM model parameters estimated by sctransform.</p> +<div class="sourceCode" id="cb533"><pre 
class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb533-1" data-line-number="1"><span class="co">#sce$log10_total_counts</span></a> +<a class="sourceLine" id="cb533-2" data-line-number="2"><span class="co">## Matrix of estimated model parameters per gene (theta and regression coefficients)</span></a> +<a class="sourceLine" id="cb533-3" data-line-number="3">sctransform<span class="op">::</span><span class="kw">plot_model_pars</span>(sctnorm_data)</a></code></pre></div> +<p><img src="remove-conf_files/figure-html/sctransform-params-plot-1.png" width="90%" style="display: block; margin: auto;" /></p> +<p>Do these parameters and the regularization look sensible to you? Any concerns?</p> +<div class="sourceCode" id="cb534"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb534-1" data-line-number="1"><span class="kw">reducedDim</span>(umi.qc, <span class="st">"PCA_sctrans_norm"</span>) <-<span class="st"> </span><span class="kw">reducedDim</span>(</a> +<a class="sourceLine" id="cb534-2" data-line-number="2"> <span class="kw">runPCA</span>(umi.qc[endog_genes, ], <span class="dt">exprs_values =</span> <span class="st">"sctrans_norm"</span>)</a> +<a class="sourceLine" id="cb534-3" data-line-number="3">)</a> +<a class="sourceLine" id="cb534-4" data-line-number="4"><span class="kw">plotReducedDim</span>(</a> +<a class="sourceLine" id="cb534-5" data-line-number="5"> umi.qc,</a> +<a class="sourceLine" id="cb534-6" data-line-number="6"> <span class="dt">use_dimred =</span> <span class="st">"PCA_sctrans_norm"</span>,</a> +<a class="sourceLine" id="cb534-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb534-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb534-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> 
+<a class="sourceLine" id="cb534-10" data-line-number="10">) <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"PCA plot: sctransform normalization"</span>) </a></code></pre></div> +<div class="figure" style="text-align: center"><span id="fig:norm-pca-sctransform"></span> +<img src="remove-conf_files/figure-html/norm-pca-sctransform-1.png" alt="PCA plot of the tung data after sctransform normalisation (Pearson residuals)." width="90%" /> +<p class="caption"> +Figure 7.7: PCA plot of the tung data after sctransform normalisation (Pearson residuals). +</p> </div> -<div id="dealing-with-confounders" class="section level2"> -<h2><span class="header-section-number">7.6</span> Dealing with confounders</h2> -<div id="introduction-6" class="section level3"> -<h3><span class="header-section-number">7.6.1</span> Introduction</h3> -<p>In the previous chapter we normalized for library size, effectively removing it -as a confounder. Now we will consider removing other less well defined -confounders from our data. Technical confounders (aka batch effects) can arise -from difference in reagents, isolation methods, the lab/experimenter who -performed the experiment, even which day/time the experiment was performed. -Accounting for technical confounders, and batch effects particularly, is a large -topic that also involves principles of experimental design. Here we address -approaches that can be taken to account for confounders when the experimental -design is appropriate.</p> -<p>Fundamentally, accounting for technical confounders involves identifying and, -ideally, removing sources of variation in the expression data that are not -related to (i.e. are confounding) the biological signal of interest. 
Various -approaches exist, some of which use spike-in or housekeeping genes, and some of -which use endogenous genes.</p> -<div id="advantages-and-disadvantages-of-using-spike-ins-to-remove-confounders" class="section level4"> -<h4><span class="header-section-number">7.6.1.1</span> Advantages and disadvantages of using spike-ins to remove confounders</h4> -<p>The use of spike-ins as control genes is conceptually appealing, since (ideally) -the same amount of ERCC (or other) spike-in would be added to each cell in our -experiment. In principle, all the variability we observe for these ``genes’’ is -due to technical noise; whereas endogenous genes are affected by both technical -noise and biological variability. Technical noise can be removed by fitting a -model to the spike-ins and “substracting†this from the endogenous genes. There -are several methods available based on this premise (eg. -<a href="https://github.com/catavallejos/BASiCS">BASiCS</a>, -<a href="https://github.com/PMBio/scLVM">scLVM</a>, -<a href="http://bioconductor.org/packages/release/bioc/html/RUVSeq.html">RUVg</a>); each -using different noise models and different fitting procedures. Alternatively, -one can identify genes which exhibit significant variation beyond technical -noise (eg. Distance to median, <a href="http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2645.html">Highly variable -genes</a>).</p> -<p>Unfortunately, there are major issues with the use of spike-ins for -normalisation that limit their utility in practice. Perhaps surprisingly, their -variability can, for various reasons, actually be <em>higher</em> than that of -endogenous genes. 
One key reason for the difficulty of their use in practice is -the need to pipette miniscule volumes of spike-in solution into The most popular -set of spike-ins, namely ERCCs, are derived from bacterial sequences, which -raises concerns that their base content and structure diverges to far from gene -structure in other biological systems of interest (e.g. mammalian genes) to be -reliable for normalisation. Even in the best-case scenarios, spike-ins are -limited to use on plate-based platforms; they are fundamentally incompatible -with droplet-based platforms.</p> -<p>Given the issues with using spike-ins, better results can often be obtained by -using endogenous genes instead. Given their limited availability, normalisation -methods based only on endogenous genes needed to be developed and we consider -them generally preferable, even for platforms where spike-ins may be used. Where -we have a large number of endogenous genes that, on average, do not vary -systematically between cells and where we expect technical effects to affect a -large number of genes (a very common and reasonable assumption), then such -methods (for example, the RUVs method) can perform well.</p> -<p>We explore both general approaches below.</p> -<div class="sourceCode" id="cb517"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb517-1" data-line-number="1"><span class="kw">library</span>(scRNA.seq.funcs)</a> -<a class="sourceLine" id="cb517-2" data-line-number="2"><span class="kw">library</span>(RUVSeq)</a> -<a class="sourceLine" id="cb517-3" data-line-number="3"><span class="kw">library</span>(scater)</a> -<a class="sourceLine" id="cb517-4" data-line-number="4"><span class="kw">library</span>(SingleCellExperiment)</a> -<a class="sourceLine" id="cb517-5" data-line-number="5"><span class="kw">library</span>(scran)</a> -<a class="sourceLine" id="cb517-6" data-line-number="6"><span class="kw">library</span>(kBET)</a> -<a class="sourceLine" id="cb517-7" 
data-line-number="7"><span class="kw">library</span>(sva) <span class="co"># Combat</span></a> -<a class="sourceLine" id="cb517-8" data-line-number="8"><span class="kw">library</span>(edgeR)</a> -<a class="sourceLine" id="cb517-9" data-line-number="9"><span class="kw">library</span>(harmony)</a> -<a class="sourceLine" id="cb517-10" data-line-number="10"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a> -<a class="sourceLine" id="cb517-11" data-line-number="11"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb517-12" data-line-number="12">umi <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/tung/umi.rds"</span>)</a> -<a class="sourceLine" id="cb517-13" data-line-number="13">umi.qc <-<span class="st"> </span>umi[<span class="kw">rowData</span>(umi)<span class="op">$</span>use, <span class="kw">colData</span>(umi)<span class="op">$</span>use]</a> -<a class="sourceLine" id="cb517-14" data-line-number="14">endog_genes <-<span class="st"> </span><span class="op">!</span><span class="kw">rowData</span>(umi.qc)<span class="op">$</span>is_feature_control</a> -<a class="sourceLine" id="cb517-15" data-line-number="15">erccs <-<span class="st"> </span><span class="kw">rowData</span>(umi.qc)<span class="op">$</span>is_feature_control</a> -<a class="sourceLine" id="cb517-16" data-line-number="16"><span class="co">## Apply scran sum factor normalization</span></a> -<a class="sourceLine" id="cb517-17" data-line-number="17">qclust <-<span class="st"> </span><span class="kw">quickCluster</span>(umi.qc, <span class="dt">min.size =</span> <span class="dv">30</span>, <span class="dt">use.ranks =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb517-18" data-line-number="18">umi.qc <-<span class="st"> </span><span class="kw">computeSumFactors</span>(umi.qc, <span class="dt">sizes =</span> <span class="dv">15</span>, <span 
class="dt">clusters =</span> qclust)</a> -<a class="sourceLine" id="cb517-19" data-line-number="19">umi.qc <-<span class="st"> </span><span class="kw">normalize</span>(umi.qc)</a></code></pre></div> -</div> -</div> -<div id="linear-models" class="section level3"> -<h3><span class="header-section-number">7.6.2</span> Linear models</h3> -<p>Linear models offer a relatively simple approach to accounting for batch effects -and confounders. A linear model can correct for batches while preserving -biological effects if you have a balanced design. In a confounded/replicate -design biological effects will not be fit/preserved. We could remove batch -effects from each individual separately in order to preserve biological (and -technical) variance between individuals (we will apply a similar with -<code>mnnCorrect</code>, below).</p> -<p>Depending on how we have pre-processed our scRNA-seq data or what modelling -assumptions we are willing to make, we may choose to use normal (Gaussian) -linear models (i.e. assuming a normal distribution for noise) or generalized -linear models (GLM), where we can use any distribution from the exponential -family. Given that we obtain highly-variable count data from scRNA-seq assays, -the obvious choice for a GLM is to use the negative binomial distribution, which -has proven highly successful in the analysis of bulk RNA-seq data.</p> -<p>For demonstration purposes here we will naively correct all confounded batch -effects.</p> -<div id="gaussian-normal-linear-models" class="section level4"> -<h4><span class="header-section-number">7.6.2.1</span> Gaussian (normal) linear models</h4> -<p>The <a href="https://bioconductor.org/packages/release/bioc/html/limma.html"><code>limma</code></a> -package in Bioconductor offers a convenient and efficient means to fit a linear -model (with the same design matrix) to a dataset with a large number of features -(i.e. genes) <span class="citation">(Ritchie et al. 
<a href="#ref-Ritchie2015-ra">2015</a>)</span>. An added advantage of <code>limma</code> is its ability to -apply empirical Bayes squeezing of variance estimate to improve inference.</p> -<p>Provided we are satisfied making the assumption of a Gaussian distribution for -residuals (this may be reasonable for normalized log-counts in many cases; but -it may not be—debate continues in the literature), then we can apply <code>limma</code> -to regress out (known) unwanted sources of variation as follows.</p> -<div class="sourceCode" id="cb518"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb518-1" data-line-number="1"><span class="co">## fit a model just accounting for batch</span></a> -<a class="sourceLine" id="cb518-2" data-line-number="2">lm_design_batch <-<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="dv">0</span> <span class="op">+</span><span class="st"> </span>batch, <span class="dt">data =</span> <span class="kw">colData</span>(umi.qc))</a> -<a class="sourceLine" id="cb518-3" data-line-number="3">fit_lm_batch <-<span class="st"> </span><span class="kw">lmFit</span>(<span class="kw">logcounts</span>(umi.qc), lm_design_batch)</a> -<a class="sourceLine" id="cb518-4" data-line-number="4">resids_lm_batch <-<span class="st"> </span><span class="kw">residuals</span>(fit_lm_batch, <span class="kw">logcounts</span>(umi.qc))</a> -<a class="sourceLine" id="cb518-5" data-line-number="5"><span class="kw">assay</span>(umi.qc, <span class="st">"lm_batch"</span>) <-<span class="st"> </span>resids_lm_batch</a> -<a class="sourceLine" id="cb518-6" data-line-number="6"></a> -<a class="sourceLine" id="cb518-7" data-line-number="7"><span class="kw">reducedDim</span>(umi.qc, <span class="st">"PCA_lm_batch"</span>) <-<span class="st"> </span><span class="kw">reducedDim</span>(</a> -<a class="sourceLine" id="cb518-8" data-line-number="8"> <span class="kw">runPCA</span>(umi.qc[endog_genes, ], <span 
class="dt">exprs_values =</span> <span class="st">"lm_batch"</span>), <span class="st">"PCA"</span>)</a> -<a class="sourceLine" id="cb518-9" data-line-number="9"></a> -<a class="sourceLine" id="cb518-10" data-line-number="10"><span class="kw">plotReducedDim</span>(umi.qc, <span class="dt">use_dimred =</span> <span class="st">"PCA_lm_batch"</span>,</a> -<a class="sourceLine" id="cb518-11" data-line-number="11"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>, </a> -<a class="sourceLine" id="cb518-12" data-line-number="12"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb518-13" data-line-number="13"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb518-14" data-line-number="14"> ) <span class="op">+</span></a> -<a class="sourceLine" id="cb518-15" data-line-number="15"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"LM - regress out batch"</span>)</a></code></pre></div> -<p>Two problems are immediately apparent with the approach above. First, batch is -nested within individual, so simply regressing out batch as we have done above -also regresses out differences between individuals that we would like to -preserve. Second, we observe that the first principal component seems to -separate cells by number of genes (features) expressed, which is undesirable.</p> -<p>We can address these concerns by correcting for batch within each individual -separately, and also fitting the proportion of genes expressed per cell as a -covariate. [NB: to preserve overall differences in expression levels between -individuals we will need to apply a slight hack to the LM fit results (setting -the intercept coefficient to zero).]</p> -<p><strong>Exercise 2</strong></p> -<p>Perform LM correction for each individual separately. 
Store the final corrected -matrix in the <code>lm_batch_indi</code> slot.</p> -<p>What do you think of the results of this approach?</p> -</div> -<div id="negative-binomial-generalized-linear-models" class="section level4"> -<h4><span class="header-section-number">7.6.2.2</span> Negative binomial generalized linear models</h4> -</div> -</div> -<div id="sctransform-2" class="section level3"> -<h3><span class="header-section-number">7.6.3</span> sctransform</h3> +<p><strong>Q:</strong> What’s happened here? Was that expected? Any other comments?</p> </div> <div id="remove-unwanted-variation" class="section level3"> <h3><span class="header-section-number">7.6.4</span> Remove Unwanted Variation</h3> @@ -5352,10 +6244,12 @@ procedure (such as upper-quartile normalization). The simultaneous estimation of following three approaches to estimate the factors of unwanted variation <span class="math inline">\(W\)</span> are used:</p> <ul> -<li><em>RUVg</em> uses negative control genes (e.g. ERCCs), assumed to have constant expression across samples;</li> -<li><em>RUVs</em> uses centered (technical) replicate/negative control samples for which the covariates of interest are -constant;</li> -<li><em>RUVr</em> uses residuals, e.g., from a first-pass GLM regression of the counts on the covariates of interest.</li> +<li><em>RUVg</em> uses negative control genes (e.g. ERCCs), assumed to have constant +expression across samples;</li> +<li><em>RUVs</em> uses centered (technical) replicate/negative control samples for which +the covariates of interest are constant;</li> +<li><em>RUVr</em> uses residuals, e.g., from a first-pass GLM regression of the counts on +the covariates of interest.</li> </ul> <p>We will concentrate on the first two approaches.</p> <div id="ruvg" class="section level4"> @@ -5367,14 +6261,14 @@ represent normalized counts-per-million and then apply a log2 transformation. 
We run <em>RUVg</em> twice, with <span class="math inline">\(k=1\)</span> and <span class="math inline">\(k=10\)</span> so that we can compare the effect of estimating different number of hidden factors to capture unwanted variation in the data.</p> -<div class="sourceCode" id="cb519"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb519-1" data-line-number="1">ruvg <-<span class="st"> </span><span class="kw">RUVg</span>(<span class="kw">counts</span>(umi.qc), erccs, <span class="dt">k =</span> <span class="dv">1</span>)</a> -<a class="sourceLine" id="cb519-2" data-line-number="2"><span class="kw">assay</span>(umi.qc, <span class="st">"ruvg1"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> -<a class="sourceLine" id="cb519-3" data-line-number="3"> <span class="kw">t</span>(<span class="kw">t</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> -<a class="sourceLine" id="cb519-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb519-5" data-line-number="5">ruvg <-<span class="st"> </span><span class="kw">RUVg</span>(<span class="kw">counts</span>(umi.qc), erccs, <span class="dt">k =</span> <span class="dv">10</span>)</a> -<a class="sourceLine" id="cb519-6" data-line-number="6"><span class="kw">assay</span>(umi.qc, <span class="st">"ruvg10"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> -<a class="sourceLine" id="cb519-7" data-line-number="7"> <span class="kw">t</span>(<span class="kw">t</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">*</span><span 
class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> -<a class="sourceLine" id="cb519-8" data-line-number="8">)</a></code></pre></div> +<div class="sourceCode" id="cb535"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb535-1" data-line-number="1">ruvg <-<span class="st"> </span><span class="kw">RUVg</span>(<span class="kw">counts</span>(umi.qc), erccs, <span class="dt">k =</span> <span class="dv">1</span>)</a> +<a class="sourceLine" id="cb535-2" data-line-number="2"><span class="kw">assay</span>(umi.qc, <span class="st">"ruvg1"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> +<a class="sourceLine" id="cb535-3" data-line-number="3"> <span class="kw">t</span>(<span class="kw">t</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> +<a class="sourceLine" id="cb535-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb535-5" data-line-number="5">ruvg <-<span class="st"> </span><span class="kw">RUVg</span>(<span class="kw">counts</span>(umi.qc), erccs, <span class="dt">k =</span> <span class="dv">10</span>)</a> +<a class="sourceLine" id="cb535-6" data-line-number="6"><span class="kw">assay</span>(umi.qc, <span class="st">"ruvg10"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> +<a class="sourceLine" id="cb535-7" data-line-number="7"> <span class="kw">t</span>(<span class="kw">t</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) 
<span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> +<a class="sourceLine" id="cb535-8" data-line-number="8">)</a></code></pre></div> <p>When we assess the effectiveness of various batch correction methods below, you can discuss whether or not you think using ERCCs as negative control genes for a method like <em>RUVg</em> is advisable (in this dataset and in general).</p> @@ -5387,42 +6281,47 @@ the covariates of interest are constant.</p> represent normalized counts-per-million and then apply a log2 transformation. Again, we run the method with <span class="math inline">\(k=1\)</span> and <span class="math inline">\(k=10\)</span> so that we can compare the effect of estimating different number of hidden factors.</p> -<div class="sourceCode" id="cb520"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb520-1" data-line-number="1">scIdx <-<span class="st"> </span><span class="kw">matrix</span>(<span class="op">-</span><span class="dv">1</span>, <span class="dt">ncol =</span> <span class="kw">max</span>(<span class="kw">table</span>(umi.qc<span class="op">$</span>individual)), <span class="dt">nrow =</span> <span class="dv">3</span>)</a> -<a class="sourceLine" id="cb520-2" data-line-number="2">tmp <-<span class="st"> </span><span class="kw">which</span>(umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>)</a> -<a class="sourceLine" id="cb520-3" data-line-number="3">scIdx[<span class="dv">1</span>, <span class="dv">1</span><span class="op">:</span><span class="kw">length</span>(tmp)] <-<span class="st"> </span>tmp</a> -<a class="sourceLine" id="cb520-4" data-line-number="4">tmp <-<span class="st"> </span><span class="kw">which</span>(umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>)</a> -<a class="sourceLine" id="cb520-5" data-line-number="5">scIdx[<span class="dv">2</span>, <span class="dv">1</span><span 
class="op">:</span><span class="kw">length</span>(tmp)] <-<span class="st"> </span>tmp</a> -<a class="sourceLine" id="cb520-6" data-line-number="6">tmp <-<span class="st"> </span><span class="kw">which</span>(umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>)</a> -<a class="sourceLine" id="cb520-7" data-line-number="7">scIdx[<span class="dv">3</span>, <span class="dv">1</span><span class="op">:</span><span class="kw">length</span>(tmp)] <-<span class="st"> </span>tmp</a> -<a class="sourceLine" id="cb520-8" data-line-number="8">cIdx <-<span class="st"> </span><span class="kw">rownames</span>(umi.qc)</a> -<a class="sourceLine" id="cb520-9" data-line-number="9">ruvs <-<span class="st"> </span><span class="kw">RUVs</span>(<span class="kw">counts</span>(umi.qc), cIdx, <span class="dt">k =</span> <span class="dv">1</span>, <span class="dt">scIdx =</span> scIdx, <span class="dt">isLog =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb520-10" data-line-number="10"><span class="kw">assay</span>(umi.qc, <span class="st">"ruvs1"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> -<a class="sourceLine" id="cb520-11" data-line-number="11"> <span class="kw">t</span>(<span class="kw">t</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> -<a class="sourceLine" id="cb520-12" data-line-number="12">)</a> -<a class="sourceLine" id="cb520-13" data-line-number="13">ruvs <-<span class="st"> </span><span class="kw">RUVs</span>(<span class="kw">counts</span>(umi.qc), cIdx, <span class="dt">k =</span> <span class="dv">10</span>, <span class="dt">scIdx =</span> scIdx, <span class="dt">isLog =</span> <span 
class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb520-14" data-line-number="14"><span class="kw">assay</span>(umi.qc, <span class="st">"ruvs10"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> -<a class="sourceLine" id="cb520-15" data-line-number="15"> <span class="kw">t</span>(<span class="kw">t</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> -<a class="sourceLine" id="cb520-16" data-line-number="16">)</a></code></pre></div> +<div class="sourceCode" id="cb536"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb536-1" data-line-number="1">scIdx <-<span class="st"> </span><span class="kw">matrix</span>(<span class="op">-</span><span class="dv">1</span>, <span class="dt">ncol =</span> <span class="kw">max</span>(<span class="kw">table</span>(umi.qc<span class="op">$</span>individual)), <span class="dt">nrow =</span> <span class="dv">3</span>)</a> +<a class="sourceLine" id="cb536-2" data-line-number="2">tmp <-<span class="st"> </span><span class="kw">which</span>(umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>)</a> +<a class="sourceLine" id="cb536-3" data-line-number="3">scIdx[<span class="dv">1</span>, <span class="dv">1</span><span class="op">:</span><span class="kw">length</span>(tmp)] <-<span class="st"> </span>tmp</a> +<a class="sourceLine" id="cb536-4" data-line-number="4">tmp <-<span class="st"> </span><span class="kw">which</span>(umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>)</a> +<a class="sourceLine" id="cb536-5" data-line-number="5">scIdx[<span class="dv">2</span>, <span class="dv">1</span><span 
class="op">:</span><span class="kw">length</span>(tmp)] <-<span class="st"> </span>tmp</a> +<a class="sourceLine" id="cb536-6" data-line-number="6">tmp <-<span class="st"> </span><span class="kw">which</span>(umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>)</a> +<a class="sourceLine" id="cb536-7" data-line-number="7">scIdx[<span class="dv">3</span>, <span class="dv">1</span><span class="op">:</span><span class="kw">length</span>(tmp)] <-<span class="st"> </span>tmp</a> +<a class="sourceLine" id="cb536-8" data-line-number="8">cIdx <-<span class="st"> </span><span class="kw">rownames</span>(umi.qc)</a> +<a class="sourceLine" id="cb536-9" data-line-number="9">ruvs <-<span class="st"> </span><span class="kw">RUVs</span>(<span class="kw">counts</span>(umi.qc), cIdx, <span class="dt">k =</span> <span class="dv">1</span>, <span class="dt">scIdx =</span> scIdx, <span class="dt">isLog =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb536-10" data-line-number="10"><span class="kw">assay</span>(umi.qc, <span class="st">"ruvs1"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> +<a class="sourceLine" id="cb536-11" data-line-number="11"> <span class="kw">t</span>(<span class="kw">t</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> +<a class="sourceLine" id="cb536-12" data-line-number="12">)</a> +<a class="sourceLine" id="cb536-13" data-line-number="13">ruvs <-<span class="st"> </span><span class="kw">RUVs</span>(<span class="kw">counts</span>(umi.qc), cIdx, <span class="dt">k =</span> <span class="dv">10</span>, <span class="dt">scIdx =</span> scIdx, <span class="dt">isLog =</span> <span 
class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb536-14" data-line-number="14"><span class="kw">assay</span>(umi.qc, <span class="st">"ruvs10"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> +<a class="sourceLine" id="cb536-15" data-line-number="15"> <span class="kw">t</span>(<span class="kw">t</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> +<a class="sourceLine" id="cb536-16" data-line-number="16">)</a></code></pre></div> </div> </div> <div id="combat" class="section level3"> <h3><span class="header-section-number">7.6.5</span> Combat</h3> -<p>If you have an experiment with a balanced design, <code>Combat</code> can be used to eliminate batch effects while preserving biological effects by specifying the biological effects using the <code>mod</code> parameter. 
However the <code>Tung</code> data contains multiple experimental replicates rather than a balanced design so using <code>mod1</code> to preserve biological variability will result in an error.</p> -<div class="sourceCode" id="cb521"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb521-1" data-line-number="1">combat_data <-<span class="st"> </span><span class="kw">logcounts</span>(umi.qc)</a> -<a class="sourceLine" id="cb521-2" data-line-number="2">mod_data <-<span class="st"> </span><span class="kw">as.data.frame</span>(<span class="kw">t</span>(combat_data))</a> -<a class="sourceLine" id="cb521-3" data-line-number="3"><span class="co"># Basic batch removal</span></a> -<a class="sourceLine" id="cb521-4" data-line-number="4">mod0 <-<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span><span class="dv">1</span>, <span class="dt">data =</span> mod_data) </a> -<a class="sourceLine" id="cb521-5" data-line-number="5"><span class="co"># Preserve biological variability</span></a> -<a class="sourceLine" id="cb521-6" data-line-number="6">mod1 <-<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span>umi.qc<span class="op">$</span>individual, <span class="dt">data =</span> mod_data) </a> -<a class="sourceLine" id="cb521-7" data-line-number="7"><span class="co"># adjust for total genes detected</span></a> -<a class="sourceLine" id="cb521-8" data-line-number="8">mod2 <-<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span>umi.qc<span class="op">$</span>total_features_by_counts, <span class="dt">data =</span> mod_data)</a> -<a class="sourceLine" id="cb521-9" data-line-number="9"><span class="kw">assay</span>(umi.qc, <span class="st">"combat"</span>) <-<span class="st"> </span><span class="kw">ComBat</span>(</a> -<a class="sourceLine" id="cb521-10" data-line-number="10"> <span 
class="dt">dat =</span> <span class="kw">t</span>(mod_data), </a> -<a class="sourceLine" id="cb521-11" data-line-number="11"> <span class="dt">batch =</span> <span class="kw">factor</span>(umi.qc<span class="op">$</span>batch), </a> -<a class="sourceLine" id="cb521-12" data-line-number="12"> <span class="dt">mod =</span> mod0,</a> -<a class="sourceLine" id="cb521-13" data-line-number="13"> <span class="dt">par.prior =</span> <span class="ot">TRUE</span>,</a> -<a class="sourceLine" id="cb521-14" data-line-number="14"> <span class="dt">prior.plots =</span> <span class="ot">FALSE</span></a> -<a class="sourceLine" id="cb521-15" data-line-number="15">)</a></code></pre></div> +<p>If you have an experiment with a balanced design, <code>Combat</code> can be used to +eliminate batch effects while preserving biological effects by specifying the +biological effects using the <code>mod</code> parameter. However the <code>Tung</code> data contains +multiple experimental replicates rather than a balanced design so using <code>mod1</code> +to preserve biological variability will result in an error.</p> +<div class="sourceCode" id="cb537"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb537-1" data-line-number="1">combat_data <-<span class="st"> </span><span class="kw">logcounts</span>(umi.qc)</a> +<a class="sourceLine" id="cb537-2" data-line-number="2">mod_data <-<span class="st"> </span><span class="kw">as.data.frame</span>(<span class="kw">t</span>(combat_data))</a> +<a class="sourceLine" id="cb537-3" data-line-number="3"><span class="co"># Basic batch removal</span></a> +<a class="sourceLine" id="cb537-4" data-line-number="4">mod0 <-<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span><span class="dv">1</span>, <span class="dt">data =</span> mod_data) </a> +<a class="sourceLine" id="cb537-5" data-line-number="5"><span class="co"># Preserve biological variability</span></a> +<a 
class="sourceLine" id="cb537-6" data-line-number="6">mod1 <-<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span>umi.qc<span class="op">$</span>individual, <span class="dt">data =</span> mod_data) </a> +<a class="sourceLine" id="cb537-7" data-line-number="7"><span class="co"># adjust for total genes detected</span></a> +<a class="sourceLine" id="cb537-8" data-line-number="8">mod2 <-<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span>umi.qc<span class="op">$</span>total_features_by_counts, <span class="dt">data =</span> mod_data)</a> +<a class="sourceLine" id="cb537-9" data-line-number="9"><span class="kw">assay</span>(umi.qc, <span class="st">"combat"</span>) <-<span class="st"> </span><span class="kw">ComBat</span>(</a> +<a class="sourceLine" id="cb537-10" data-line-number="10"> <span class="dt">dat =</span> <span class="kw">t</span>(mod_data), </a> +<a class="sourceLine" id="cb537-11" data-line-number="11"> <span class="dt">batch =</span> <span class="kw">factor</span>(umi.qc<span class="op">$</span>batch), </a> +<a class="sourceLine" id="cb537-12" data-line-number="12"> <span class="dt">mod =</span> mod0,</a> +<a class="sourceLine" id="cb537-13" data-line-number="13"> <span class="dt">par.prior =</span> <span class="ot">TRUE</span>,</a> +<a class="sourceLine" id="cb537-14" data-line-number="14"> <span class="dt">prior.plots =</span> <span class="ot">FALSE</span></a> +<a class="sourceLine" id="cb537-15" data-line-number="15">)</a></code></pre></div> +<pre><code>## Standardizing Data across genes</code></pre> <p><strong>Exercise 1</strong></p> <p>Perform <code>ComBat</code> correction accounting for total features as a co-variate. Store the corrected matrix in the <code>combat_tf</code> slot.</p> </div> @@ -5436,47 +6335,48 @@ normalize each individual separately. 
Note that this will remove batch effects between batches within the same individual but not the batch effects between batches in different individuals, due to the confounded experimental design.</p> <p>Thus we will merge a replicate from each individual to form three batches.</p> -<div class="sourceCode" id="cb522"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb522-1" data-line-number="1">do_mnn <-<span class="st"> </span><span class="cf">function</span>(data.qc) {</a> -<a class="sourceLine" id="cb522-2" data-line-number="2"> batch1 <-<span class="st"> </span><span class="kw">logcounts</span>(data.qc[, data.qc<span class="op">$</span>replicate <span class="op">==</span><span class="st"> "r1"</span>])</a> -<a class="sourceLine" id="cb522-3" data-line-number="3"> batch2 <-<span class="st"> </span><span class="kw">logcounts</span>(data.qc[, data.qc<span class="op">$</span>replicate <span class="op">==</span><span class="st"> "r2"</span>])</a> -<a class="sourceLine" id="cb522-4" data-line-number="4"> batch3 <-<span class="st"> </span><span class="kw">logcounts</span>(data.qc[, data.qc<span class="op">$</span>replicate <span class="op">==</span><span class="st"> "r3"</span>])</a> -<a class="sourceLine" id="cb522-5" data-line-number="5"> </a> -<a class="sourceLine" id="cb522-6" data-line-number="6"> <span class="cf">if</span> (<span class="kw">ncol</span>(batch2) <span class="op">></span><span class="st"> </span><span class="dv">0</span>) {</a> -<a class="sourceLine" id="cb522-7" data-line-number="7"> x <-<span class="st"> </span>batchelor<span class="op">::</span><span class="kw">mnnCorrect</span>(</a> -<a class="sourceLine" id="cb522-8" data-line-number="8"> batch1, batch2, batch3, </a> -<a class="sourceLine" id="cb522-9" data-line-number="9"> <span class="dt">k =</span> <span class="dv">20</span>,</a> -<a class="sourceLine" id="cb522-10" data-line-number="10"> <span class="dt">sigma =</span> <span class="fl">0.1</span>,</a> -<a 
class="sourceLine" id="cb522-11" data-line-number="11"> <span class="dt">cos.norm.in =</span> <span class="ot">TRUE</span>,</a> -<a class="sourceLine" id="cb522-12" data-line-number="12"> <span class="dt">svd.dim =</span> <span class="dv">2</span></a> -<a class="sourceLine" id="cb522-13" data-line-number="13"> )</a> -<a class="sourceLine" id="cb522-14" data-line-number="14"> <span class="kw">return</span>(x)</a> -<a class="sourceLine" id="cb522-15" data-line-number="15"> } <span class="cf">else</span> {</a> -<a class="sourceLine" id="cb522-16" data-line-number="16"> x <-<span class="st"> </span>batchelor<span class="op">::</span><span class="kw">mnnCorrect</span>(</a> -<a class="sourceLine" id="cb522-17" data-line-number="17"> batch1, batch3, </a> -<a class="sourceLine" id="cb522-18" data-line-number="18"> <span class="dt">k =</span> <span class="dv">20</span>,</a> -<a class="sourceLine" id="cb522-19" data-line-number="19"> <span class="dt">sigma =</span> <span class="fl">0.1</span>,</a> -<a class="sourceLine" id="cb522-20" data-line-number="20"> <span class="dt">cos.norm.in =</span> <span class="ot">TRUE</span>,</a> -<a class="sourceLine" id="cb522-21" data-line-number="21"> <span class="dt">svd.dim =</span> <span class="dv">2</span></a> -<a class="sourceLine" id="cb522-22" data-line-number="22"> )</a> -<a class="sourceLine" id="cb522-23" data-line-number="23"> <span class="kw">return</span>(x)</a> -<a class="sourceLine" id="cb522-24" data-line-number="24"> }</a> -<a class="sourceLine" id="cb522-25" data-line-number="25">}</a> -<a class="sourceLine" id="cb522-26" data-line-number="26"></a> -<a class="sourceLine" id="cb522-27" data-line-number="27">indi1 <-<span class="st"> </span><span class="kw">do_mnn</span>(umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>])</a> -<a class="sourceLine" id="cb522-28" data-line-number="28">indi2 <-<span class="st"> </span><span class="kw">do_mnn</span>(umi.qc[, 
umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>])</a> -<a class="sourceLine" id="cb522-29" data-line-number="29">indi3 <-<span class="st"> </span><span class="kw">do_mnn</span>(umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>])</a> -<a class="sourceLine" id="cb522-30" data-line-number="30"></a> -<a class="sourceLine" id="cb522-31" data-line-number="31"><span class="kw">identical</span>(<span class="kw">colnames</span>(umi.qc), <span class="kw">colnames</span>(<span class="kw">cbind</span>(indi1, indi2, indi3)))</a> -<a class="sourceLine" id="cb522-32" data-line-number="32"><span class="kw">assay</span>(umi.qc, <span class="st">"mnn"</span>) <-<span class="st"> </span><span class="kw">assay</span>(<span class="kw">cbind</span>(indi1, indi2, indi3), <span class="st">"corrected"</span>)</a> -<a class="sourceLine" id="cb522-33" data-line-number="33"></a> -<a class="sourceLine" id="cb522-34" data-line-number="34"><span class="co"># For a balanced design: </span></a> -<a class="sourceLine" id="cb522-35" data-line-number="35"><span class="co">#assay(umi.qc, "mnn") <- mnnCorrect(</span></a> -<a class="sourceLine" id="cb522-36" data-line-number="36"><span class="co"># list(B1 = logcounts(batch1), B2 = logcounts(batch2), B3 = logcounts(batch3)), </span></a> -<a class="sourceLine" id="cb522-37" data-line-number="37"><span class="co"># k = 20,</span></a> -<a class="sourceLine" id="cb522-38" data-line-number="38"><span class="co"># sigma = 0.1,</span></a> -<a class="sourceLine" id="cb522-39" data-line-number="39"><span class="co"># cos.norm = TRUE,</span></a> -<a class="sourceLine" id="cb522-40" data-line-number="40"><span class="co"># svd.dim = 2</span></a> -<a class="sourceLine" id="cb522-41" data-line-number="41"><span class="co">#)</span></a></code></pre></div> +<div class="sourceCode" id="cb539"><pre class="sourceCode r"><code class="sourceCode r"><a 
class="sourceLine" id="cb539-1" data-line-number="1">do_mnn <-<span class="st"> </span><span class="cf">function</span>(data.qc) {</a> +<a class="sourceLine" id="cb539-2" data-line-number="2"> batch1 <-<span class="st"> </span><span class="kw">logcounts</span>(data.qc[, data.qc<span class="op">$</span>replicate <span class="op">==</span><span class="st"> "r1"</span>])</a> +<a class="sourceLine" id="cb539-3" data-line-number="3"> batch2 <-<span class="st"> </span><span class="kw">logcounts</span>(data.qc[, data.qc<span class="op">$</span>replicate <span class="op">==</span><span class="st"> "r2"</span>])</a> +<a class="sourceLine" id="cb539-4" data-line-number="4"> batch3 <-<span class="st"> </span><span class="kw">logcounts</span>(data.qc[, data.qc<span class="op">$</span>replicate <span class="op">==</span><span class="st"> "r3"</span>])</a> +<a class="sourceLine" id="cb539-5" data-line-number="5"> </a> +<a class="sourceLine" id="cb539-6" data-line-number="6"> <span class="cf">if</span> (<span class="kw">ncol</span>(batch2) <span class="op">></span><span class="st"> </span><span class="dv">0</span>) {</a> +<a class="sourceLine" id="cb539-7" data-line-number="7"> x <-<span class="st"> </span>batchelor<span class="op">::</span><span class="kw">mnnCorrect</span>(</a> +<a class="sourceLine" id="cb539-8" data-line-number="8"> batch1, batch2, batch3, </a> +<a class="sourceLine" id="cb539-9" data-line-number="9"> <span class="dt">k =</span> <span class="dv">20</span>,</a> +<a class="sourceLine" id="cb539-10" data-line-number="10"> <span class="dt">sigma =</span> <span class="fl">0.1</span>,</a> +<a class="sourceLine" id="cb539-11" data-line-number="11"> <span class="dt">cos.norm.in =</span> <span class="ot">TRUE</span>,</a> +<a class="sourceLine" id="cb539-12" data-line-number="12"> <span class="dt">svd.dim =</span> <span class="dv">2</span></a> +<a class="sourceLine" id="cb539-13" data-line-number="13"> )</a> +<a class="sourceLine" id="cb539-14" data-line-number="14"> 
<span class="kw">return</span>(x)</a> +<a class="sourceLine" id="cb539-15" data-line-number="15"> } <span class="cf">else</span> {</a> +<a class="sourceLine" id="cb539-16" data-line-number="16"> x <-<span class="st"> </span>batchelor<span class="op">::</span><span class="kw">mnnCorrect</span>(</a> +<a class="sourceLine" id="cb539-17" data-line-number="17"> batch1, batch3, </a> +<a class="sourceLine" id="cb539-18" data-line-number="18"> <span class="dt">k =</span> <span class="dv">20</span>,</a> +<a class="sourceLine" id="cb539-19" data-line-number="19"> <span class="dt">sigma =</span> <span class="fl">0.1</span>,</a> +<a class="sourceLine" id="cb539-20" data-line-number="20"> <span class="dt">cos.norm.in =</span> <span class="ot">TRUE</span>,</a> +<a class="sourceLine" id="cb539-21" data-line-number="21"> <span class="dt">svd.dim =</span> <span class="dv">2</span></a> +<a class="sourceLine" id="cb539-22" data-line-number="22"> )</a> +<a class="sourceLine" id="cb539-23" data-line-number="23"> <span class="kw">return</span>(x)</a> +<a class="sourceLine" id="cb539-24" data-line-number="24"> }</a> +<a class="sourceLine" id="cb539-25" data-line-number="25">}</a> +<a class="sourceLine" id="cb539-26" data-line-number="26"></a> +<a class="sourceLine" id="cb539-27" data-line-number="27">indi1 <-<span class="st"> </span><span class="kw">do_mnn</span>(umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>])</a> +<a class="sourceLine" id="cb539-28" data-line-number="28">indi2 <-<span class="st"> </span><span class="kw">do_mnn</span>(umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>])</a> +<a class="sourceLine" id="cb539-29" data-line-number="29">indi3 <-<span class="st"> </span><span class="kw">do_mnn</span>(umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>])</a> +<a class="sourceLine" id="cb539-30" 
data-line-number="30"></a> +<a class="sourceLine" id="cb539-31" data-line-number="31"><span class="kw">identical</span>(<span class="kw">colnames</span>(umi.qc), <span class="kw">colnames</span>(<span class="kw">cbind</span>(indi1, indi2, indi3)))</a></code></pre></div> +<pre><code>## [1] TRUE</code></pre> +<div class="sourceCode" id="cb541"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb541-1" data-line-number="1"><span class="kw">assay</span>(umi.qc, <span class="st">"mnn"</span>) <-<span class="st"> </span><span class="kw">assay</span>(<span class="kw">cbind</span>(indi1, indi2, indi3), <span class="st">"corrected"</span>)</a> +<a class="sourceLine" id="cb541-2" data-line-number="2"></a> +<a class="sourceLine" id="cb541-3" data-line-number="3"><span class="co"># For a balanced design: </span></a> +<a class="sourceLine" id="cb541-4" data-line-number="4"><span class="co">#assay(umi.qc, "mnn") <- mnnCorrect(</span></a> +<a class="sourceLine" id="cb541-5" data-line-number="5"><span class="co"># list(B1 = logcounts(batch1), B2 = logcounts(batch2), B3 = logcounts(batch3)), </span></a> +<a class="sourceLine" id="cb541-6" data-line-number="6"><span class="co"># k = 20,</span></a> +<a class="sourceLine" id="cb541-7" data-line-number="7"><span class="co"># sigma = 0.1,</span></a> +<a class="sourceLine" id="cb541-8" data-line-number="8"><span class="co"># cos.norm = TRUE,</span></a> +<a class="sourceLine" id="cb541-9" data-line-number="9"><span class="co"># svd.dim = 2</span></a> +<a class="sourceLine" id="cb541-10" data-line-number="10"><span class="co">#)</span></a></code></pre></div> <p>The latest version of the <a href="https://www.bioconductor.org/packages/release/bioc/html/batchelor.html"><code>batchelor</code></a> package has a new <code>fastMNN()</code> method. The <code>fastMNN()</code> function performs a @@ -5486,52 +6386,80 @@ advantages in speed and denoising. 
The function returns a <code>SingleCellExperi object containing a matrix of corrected PC scores, which can be used directly for downstream analyses like clustering and visualization. [NB: <code>fastMNN</code> may actually be slower on small datasets like that considered here.]</p> -<div class="sourceCode" id="cb523"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb523-1" data-line-number="1">indi1 <-<span class="st"> </span>batchelor<span class="op">::</span><span class="kw">fastMNN</span>(</a> -<a class="sourceLine" id="cb523-2" data-line-number="2"> umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>],</a> -<a class="sourceLine" id="cb523-3" data-line-number="3"> <span class="dt">batch =</span> umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>]<span class="op">$</span>replicate)</a> -<a class="sourceLine" id="cb523-4" data-line-number="4">indi2 <-<span class="st"> </span>batchelor<span class="op">::</span><span class="kw">fastMNN</span>(</a> -<a class="sourceLine" id="cb523-5" data-line-number="5"> umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>],</a> -<a class="sourceLine" id="cb523-6" data-line-number="6"> <span class="dt">batch =</span> umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>]<span class="op">$</span>replicate)</a> -<a class="sourceLine" id="cb523-7" data-line-number="7">indi3 <-<span class="st"> </span>batchelor<span class="op">::</span><span class="kw">fastMNN</span>(</a> -<a class="sourceLine" id="cb523-8" data-line-number="8"> umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>],</a> -<a class="sourceLine" id="cb523-9" data-line-number="9"> <span class="dt">batch =</span> umi.qc[, umi.qc<span 
class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>]<span class="op">$</span>replicate)</a> -<a class="sourceLine" id="cb523-10" data-line-number="10"></a> -<a class="sourceLine" id="cb523-11" data-line-number="11"><span class="kw">identical</span>(<span class="kw">colnames</span>(umi.qc),</a> -<a class="sourceLine" id="cb523-12" data-line-number="12"> <span class="kw">colnames</span>(<span class="kw">cbind</span>(<span class="kw">assay</span>(indi1, <span class="st">"reconstructed"</span>),</a> -<a class="sourceLine" id="cb523-13" data-line-number="13"> <span class="kw">assay</span>(indi2, <span class="st">"reconstructed"</span>),</a> -<a class="sourceLine" id="cb523-14" data-line-number="14"> <span class="kw">assay</span>(indi3, <span class="st">"reconstructed"</span>)))) </a> -<a class="sourceLine" id="cb523-15" data-line-number="15">fastmnn <-<span class="st"> </span><span class="kw">cbind</span>(<span class="kw">assay</span>(indi1, <span class="st">"reconstructed"</span>),</a> -<a class="sourceLine" id="cb523-16" data-line-number="16"> <span class="kw">assay</span>(indi2, <span class="st">"reconstructed"</span>),</a> -<a class="sourceLine" id="cb523-17" data-line-number="17"> <span class="kw">assay</span>(indi3, <span class="st">"reconstructed"</span>))</a> -<a class="sourceLine" id="cb523-18" data-line-number="18"><span class="kw">identical</span>(<span class="kw">rownames</span>(umi.qc), <span class="kw">rownames</span>(fastmnn))</a> -<a class="sourceLine" id="cb523-19" data-line-number="19"><span class="co">## fastMNN() drops 66 genes, so we cannot immediately add the reconstructed expression matrix to assays() in umi.qc</span></a> -<a class="sourceLine" id="cb523-20" data-line-number="20"><span class="co">## But we can run PCA on the reconstructed data from fastMNN() and add that to the reducedDim slot of our SCE object</span></a> -<a class="sourceLine" id="cb523-21" data-line-number="21">fastmnn_pca <-<span 
class="st"> </span><span class="kw">runPCA</span>(fastmnn)</a> -<a class="sourceLine" id="cb523-22" data-line-number="22"><span class="kw">reducedDim</span>(umi.qc, <span class="st">"fastmnn"</span>) <-<span class="st"> </span>fastmnn_pca</a></code></pre></div> +<div class="sourceCode" id="cb542"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb542-1" data-line-number="1">indi1 <-<span class="st"> </span>batchelor<span class="op">::</span><span class="kw">fastMNN</span>(</a> +<a class="sourceLine" id="cb542-2" data-line-number="2"> umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>],</a> +<a class="sourceLine" id="cb542-3" data-line-number="3"> <span class="dt">batch =</span> umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>]<span class="op">$</span>replicate)</a></code></pre></div> +<pre><code>## Warning in sweep(centered, 2, w, "/", check.margin = FALSE): 'check.margin' is ignored when 'x' is a DelayedArray object or +## derivative</code></pre> +<div class="sourceCode" id="cb544"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb544-1" data-line-number="1">indi2 <-<span class="st"> </span>batchelor<span class="op">::</span><span class="kw">fastMNN</span>(</a> +<a class="sourceLine" id="cb544-2" data-line-number="2"> umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>],</a> +<a class="sourceLine" id="cb544-3" data-line-number="3"> <span class="dt">batch =</span> umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>]<span class="op">$</span>replicate)</a></code></pre></div> +<pre><code>## Warning in sweep(centered, 2, w, "/", check.margin = FALSE): 'check.margin' is ignored when 'x' is a DelayedArray object or +## derivative</code></pre> +<div class="sourceCode" 
id="cb546"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb546-1" data-line-number="1">indi3 <-<span class="st"> </span>batchelor<span class="op">::</span><span class="kw">fastMNN</span>(</a> +<a class="sourceLine" id="cb546-2" data-line-number="2"> umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>],</a> +<a class="sourceLine" id="cb546-3" data-line-number="3"> <span class="dt">batch =</span> umi.qc[, umi.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>]<span class="op">$</span>replicate)</a></code></pre></div> +<pre><code>## Warning in sweep(centered, 2, w, "/", check.margin = FALSE): 'check.margin' is ignored when 'x' is a DelayedArray object or +## derivative</code></pre> +<div class="sourceCode" id="cb548"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb548-1" data-line-number="1"><span class="kw">identical</span>(<span class="kw">colnames</span>(umi.qc),</a> +<a class="sourceLine" id="cb548-2" data-line-number="2"> <span class="kw">colnames</span>(<span class="kw">cbind</span>(<span class="kw">assay</span>(indi1, <span class="st">"reconstructed"</span>),</a> +<a class="sourceLine" id="cb548-3" data-line-number="3"> <span class="kw">assay</span>(indi2, <span class="st">"reconstructed"</span>),</a> +<a class="sourceLine" id="cb548-4" data-line-number="4"> <span class="kw">assay</span>(indi3, <span class="st">"reconstructed"</span>)))) </a></code></pre></div> +<pre><code>## [1] TRUE</code></pre> +<div class="sourceCode" id="cb550"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb550-1" data-line-number="1">fastmnn <-<span class="st"> </span><span class="kw">cbind</span>(<span class="kw">assay</span>(indi1, <span class="st">"reconstructed"</span>),</a> +<a class="sourceLine" id="cb550-2" data-line-number="2"> <span class="kw">assay</span>(indi2, <span 
class="st">"reconstructed"</span>),</a> +<a class="sourceLine" id="cb550-3" data-line-number="3"> <span class="kw">assay</span>(indi3, <span class="st">"reconstructed"</span>))</a> +<a class="sourceLine" id="cb550-4" data-line-number="4"><span class="kw">identical</span>(<span class="kw">rownames</span>(umi.qc), <span class="kw">rownames</span>(fastmnn))</a></code></pre></div> +<pre><code>## [1] FALSE</code></pre> +<div class="sourceCode" id="cb552"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb552-1" data-line-number="1"><span class="co">## fastMNN() drops 66 genes, so we cannot immediately add the reconstructed expression matrix to assays() in umi.qc</span></a> +<a class="sourceLine" id="cb552-2" data-line-number="2"><span class="co">## But we can run PCA on the reconstructed data from fastMNN() and add that to the reducedDim slot of our SCE object</span></a> +<a class="sourceLine" id="cb552-3" data-line-number="3">fastmnn_pca <-<span class="st"> </span><span class="kw">runPCA</span>(fastmnn, <span class="dt">rank=</span><span class="dv">2</span>)</a> +<a class="sourceLine" id="cb552-4" data-line-number="4"><span class="kw">reducedDim</span>(umi.qc, <span class="st">"fastmnn"</span>) <-<span class="st"> </span>fastmnn_pca<span class="op">$</span>rotation</a></code></pre></div> <p>For further details, please consult the <code>batchelor</code> package documentation and <a href="https://www.bioconductor.org/packages/release/bioc/vignettes/batchelor/inst/doc/correction.html">vignette</a>.</p> </div> <div id="harmony" class="section level3"> <h3><span class="header-section-number">7.6.7</span> Harmony</h3> -<p>Harmony [Korsunsky2018fast] is a newer batch correction method, which is designed to operate on PC space. The algorithm proceeds to iteratively cluster the cells, with the objective function formulated to promote cells from multiple datasets within each cluster. 
Once a clustering is obtained, the positions of the centroids of each dataset are obtained on a per-cluster basis and the coordinates are corrected. This procedure is iterated until convergence. Harmony comes with a <code>theta</code> parameter that controls the degree of batch correction (higher values lead to more dataset integration), and can account for multiple experimental and biological factors on input.</p> -<p>Seeing how the end result of Harmony is an altered dimensional reduction space created on the basis of PCA, we plot the obtained manifold here and exclude it from the rest of the follow-ups in the section.</p> -<div class="sourceCode" id="cb524"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb524-1" data-line-number="1">umi.qc.endog <-<span class="st"> </span>umi.qc[endog_genes,]</a> -<a class="sourceLine" id="cb524-2" data-line-number="2">umi.qc.endog <-<span class="st"> </span><span class="kw">runPCA</span>(umi.qc.endog, <span class="dt">exprs_values =</span> <span class="st">'logcounts'</span>, <span class="dt">ncomponents =</span> <span class="dv">20</span>)</a> -<a class="sourceLine" id="cb524-3" data-line-number="3">pca <-<span class="st"> </span><span class="kw">as.matrix</span>(<span class="kw">reducedDim</span>(umi.qc.endog, <span class="st">"PCA"</span>))</a> -<a class="sourceLine" id="cb524-4" data-line-number="4">harmony_emb <-<span class="st"> </span><span class="kw">HarmonyMatrix</span>(pca, umi.qc.endog<span class="op">$</span>batch, <span class="dt">theta=</span><span class="dv">2</span>, <span class="dt">do_pca=</span><span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb524-5" data-line-number="5"><span class="kw">reducedDim</span>(umi.qc.endog, <span class="st">"harmony"</span>) <-<span class="st"> </span>harmony_emb</a> -<a class="sourceLine" id="cb524-6" data-line-number="6"></a> -<a class="sourceLine" id="cb524-7" data-line-number="7"><span class="kw">plotReducedDim</span>(</a> -<a 
class="sourceLine" id="cb524-8" data-line-number="8"> umi.qc.endog,</a> -<a class="sourceLine" id="cb524-9" data-line-number="9"> <span class="dt">use_dimred =</span> <span class="st">'harmony'</span>,</a> -<a class="sourceLine" id="cb524-10" data-line-number="10"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb524-11" data-line-number="11"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb524-12" data-line-number="12"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb524-13" data-line-number="13">)</a> -<a class="sourceLine" id="cb524-14" data-line-number="14"><span class="kw">reducedDim</span>(umi.qc, <span class="st">"harmony"</span>) <-<span class="st"> </span><span class="kw">reducedDim</span>(umi.qc.endog, <span class="st">"harmony"</span>)</a></code></pre></div> +<p>Harmony [Korsunsky2018fast] is a newer batch correction method, which is +designed to operate on PC space. The algorithm proceeds to iteratively cluster +the cells, with the objective function formulated to promote cells from multiple +datasets within each cluster. Once a clustering is obtained, the positions of +the centroids of each dataset are obtained on a per-cluster basis and the +coordinates are corrected. This procedure is iterated until convergence. 
Harmony +comes with a <code>theta</code> parameter that controls the degree of batch correction +(higher values lead to more dataset integration), and can account for multiple +experimental and biological factors on input.</p> +<p>Seeing how the end result of Harmony is an altered dimensional reduction space +created on the basis of PCA, we plot the obtained manifold here and exclude it +from the rest of the follow-ups in the section.</p> +<div class="sourceCode" id="cb553"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb553-1" data-line-number="1">umi.qc.endog <-<span class="st"> </span>umi.qc[endog_genes,]</a> +<a class="sourceLine" id="cb553-2" data-line-number="2">umi.qc.endog <-<span class="st"> </span><span class="kw">runPCA</span>(umi.qc.endog, <span class="dt">exprs_values =</span> <span class="st">'logcounts'</span>, <span class="dt">ncomponents =</span> <span class="dv">20</span>)</a> +<a class="sourceLine" id="cb553-3" data-line-number="3">pca <-<span class="st"> </span><span class="kw">as.matrix</span>(<span class="kw">reducedDim</span>(umi.qc.endog, <span class="st">"PCA"</span>))</a> +<a class="sourceLine" id="cb553-4" data-line-number="4">harmony_emb <-<span class="st"> </span><span class="kw">HarmonyMatrix</span>(pca, umi.qc.endog<span class="op">$</span>batch, <span class="dt">theta=</span><span class="dv">2</span>, <span class="dt">do_pca=</span><span class="ot">FALSE</span>)</a></code></pre></div> +<pre><code>## Harmony 1/10</code></pre> +<pre><code>## Harmony 2/10</code></pre> +<pre><code>## Harmony 3/10</code></pre> +<pre><code>## Harmony 4/10</code></pre> +<pre><code>## Harmony 5/10</code></pre> +<pre><code>## Harmony 6/10</code></pre> +<pre><code>## Harmony 7/10</code></pre> +<pre><code>## Harmony 8/10</code></pre> +<pre><code>## Harmony 9/10</code></pre> +<pre><code>## Harmony 10/10</code></pre> +<div class="sourceCode" id="cb564"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" 
id="cb564-1" data-line-number="1"><span class="kw">reducedDim</span>(umi.qc.endog, <span class="st">"harmony"</span>) <-<span class="st"> </span>harmony_emb</a> +<a class="sourceLine" id="cb564-2" data-line-number="2"></a> +<a class="sourceLine" id="cb564-3" data-line-number="3"><span class="kw">plotReducedDim</span>(</a> +<a class="sourceLine" id="cb564-4" data-line-number="4"> umi.qc.endog,</a> +<a class="sourceLine" id="cb564-5" data-line-number="5"> <span class="dt">use_dimred =</span> <span class="st">'harmony'</span>,</a> +<a class="sourceLine" id="cb564-6" data-line-number="6"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb564-7" data-line-number="7"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb564-8" data-line-number="8"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb564-9" data-line-number="9">)</a></code></pre></div> +<p><img src="remove-conf_files/figure-html/harmony-1.png" width="90%" style="display: block; margin: auto;" /></p> +<div class="sourceCode" id="cb565"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb565-1" data-line-number="1"><span class="kw">reducedDim</span>(umi.qc, <span class="st">"harmony"</span>) <-<span class="st"> </span><span class="kw">reducedDim</span>(umi.qc.endog, <span class="st">"harmony"</span>)</a></code></pre></div> </div> -<div id="how-to-evaluate-and-compare-confounder-removal-strategies" class="section level3"> -<h3><span class="header-section-number">7.6.8</span> How to evaluate and compare confounder removal strategies</h3> +<div id="how-to-evaluate-and-compare-batch-correction" class="section level3"> +<h3><span class="header-section-number">7.6.8</span> How to evaluate and compare batch correction</h3> <p>A key question when considering the different methods for removing confounders is how to 
quantitatively determine which one is the most effective. The main reason why comparisons are challenging is because it is often difficult to know @@ -5548,30 +6476,39 @@ PCA plot where colour corresponds the technical replicates and shape corresponds to different biological samples (individuals). Separation of biological samples and interspersed batches indicates that technical variation has been removed. We always use log2-cpm normalized data to match the assumptions of PCA.</p> -<div class="sourceCode" id="cb525"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb525-1" data-line-number="1"><span class="cf">for</span> (nm <span class="cf">in</span> <span class="kw">assayNames</span>(umi.qc)) {</a> -<a class="sourceLine" id="cb525-2" data-line-number="2"> <span class="kw">cat</span>(nm, <span class="st">" </span><span class="ch">\n</span><span class="st">"</span>)</a> -<a class="sourceLine" id="cb525-3" data-line-number="3"> tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> -<a class="sourceLine" id="cb525-4" data-line-number="4"> umi.qc[endog_genes, ],</a> -<a class="sourceLine" id="cb525-5" data-line-number="5"> <span class="dt">exprs_values =</span> nm</a> -<a class="sourceLine" id="cb525-6" data-line-number="6"> )</a> -<a class="sourceLine" id="cb525-7" data-line-number="7"> <span class="kw">reducedDim</span>(umi.qc, <span class="kw">paste0</span>(<span class="st">"PCA_"</span>, nm)) <-<span class="st"> </span><span class="kw">reducedDim</span>(tmp, <span class="st">"PCA"</span>)</a> -<a class="sourceLine" id="cb525-8" data-line-number="8">}</a> -<a class="sourceLine" id="cb525-9" data-line-number="9"></a> -<a class="sourceLine" id="cb525-10" data-line-number="10"></a> -<a class="sourceLine" id="cb525-11" data-line-number="11"><span class="cf">for</span> (nm <span class="cf">in</span> <span class="kw">reducedDimNames</span>(umi.qc)) {</a> -<a class="sourceLine" id="cb525-12" data-line-number="12"> <span 
class="kw">print</span>(</a> -<a class="sourceLine" id="cb525-13" data-line-number="13"> <span class="kw">plotReducedDim</span>(</a> -<a class="sourceLine" id="cb525-14" data-line-number="14"> umi.qc,</a> -<a class="sourceLine" id="cb525-15" data-line-number="15"> <span class="dt">use_dimred =</span> nm,</a> -<a class="sourceLine" id="cb525-16" data-line-number="16"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb525-17" data-line-number="17"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb525-18" data-line-number="18"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb525-19" data-line-number="19"> ) <span class="op">+</span></a> -<a class="sourceLine" id="cb525-20" data-line-number="20"><span class="st"> </span><span class="kw">ggtitle</span>(nm)</a> -<a class="sourceLine" id="cb525-21" data-line-number="21"> )</a> -<a class="sourceLine" id="cb525-22" data-line-number="22">}</a></code></pre></div> +<div class="sourceCode" id="cb566"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb566-1" data-line-number="1"><span class="cf">for</span> (nm <span class="cf">in</span> <span class="kw">assayNames</span>(umi.qc)) {</a> +<a class="sourceLine" id="cb566-2" data-line-number="2"> <span class="kw">cat</span>(nm, <span class="st">" </span><span class="ch">\n</span><span class="st">"</span>)</a> +<a class="sourceLine" id="cb566-3" data-line-number="3"> tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> +<a class="sourceLine" id="cb566-4" data-line-number="4"> umi.qc[endog_genes, ],</a> +<a class="sourceLine" id="cb566-5" data-line-number="5"> <span class="dt">exprs_values =</span> nm</a> +<a class="sourceLine" id="cb566-6" data-line-number="6"> )</a> +<a class="sourceLine" id="cb566-7" data-line-number="7"> <span 
class="kw">reducedDim</span>(umi.qc, <span class="kw">paste0</span>(<span class="st">"PCA_"</span>, nm)) <-<span class="st"> </span><span class="kw">reducedDim</span>(tmp, <span class="st">"PCA"</span>)</a> +<a class="sourceLine" id="cb566-8" data-line-number="8">}</a></code></pre></div> +<pre><code>## counts +## logcounts_raw +## logcounts +## sctrans_norm +## ruvg1 +## ruvg10 +## ruvs1 +## ruvs10 +## combat +## mnn</code></pre> +<div class="sourceCode" id="cb568"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb568-1" data-line-number="1"><span class="cf">for</span> (nm <span class="cf">in</span> <span class="kw">reducedDimNames</span>(umi.qc)) {</a> +<a class="sourceLine" id="cb568-2" data-line-number="2"> <span class="kw">print</span>(</a> +<a class="sourceLine" id="cb568-3" data-line-number="3"> <span class="kw">plotReducedDim</span>(</a> +<a class="sourceLine" id="cb568-4" data-line-number="4"> umi.qc,</a> +<a class="sourceLine" id="cb568-5" data-line-number="5"> <span class="dt">use_dimred =</span> nm,</a> +<a class="sourceLine" id="cb568-6" data-line-number="6"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb568-7" data-line-number="7"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb568-8" data-line-number="8"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb568-9" data-line-number="9"> ) <span class="op">+</span></a> +<a class="sourceLine" id="cb568-10" data-line-number="10"><span class="st"> </span><span class="kw">ggtitle</span>(nm)</a> +<a class="sourceLine" id="cb568-11" data-line-number="11"> )</a> +<a class="sourceLine" id="cb568-12" data-line-number="12">}</a></code></pre></div> +<p><img src="remove-conf_files/figure-html/effectiveness1-1.png" width="90%" style="display: block; margin: auto;" /><img 
src="remove-conf_files/figure-html/effectiveness1-2.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-3.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-4.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-5.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-6.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-7.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-8.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-9.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-10.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-11.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-12.png" width="90%" style="display: block; margin: auto;" /></p> <p><strong>Exercise 3</strong></p> -<p>Consider different <code>ks</code> for RUV normalizations. Which gives the best results?</p> +<p>Consider different <code>k</code>’s for RUV normalizations. Which gives the best results?</p> </div> <div id="effectiveness-2" class="section level4"> <h4><span class="header-section-number">7.6.8.2</span> Effectiveness 2</h4> @@ -5580,12 +6517,13 @@ expression (RLE) across cells to confirm technical noise has been removed from the dataset. Note RLE only evaluates whether the number of genes higher and lower than average are equal for each cell - i.e. systemic technical effects. 
Random technical noise between batches may not be detected by RLE.</p> -<div class="sourceCode" id="cb526"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb526-1" data-line-number="1">res <-<span class="st"> </span><span class="kw">list</span>()</a> -<a class="sourceLine" id="cb526-2" data-line-number="2"><span class="cf">for</span>(n <span class="cf">in</span> <span class="kw">assayNames</span>(umi.qc)) {</a> -<a class="sourceLine" id="cb526-3" data-line-number="3"> res[[n]] <-<span class="st"> </span><span class="kw">suppressWarnings</span>(<span class="kw">calc_cell_RLE</span>(<span class="kw">assay</span>(umi.qc, n), erccs))</a> -<a class="sourceLine" id="cb526-4" data-line-number="4">}</a> -<a class="sourceLine" id="cb526-5" data-line-number="5"><span class="kw">par</span>(<span class="dt">mar=</span><span class="kw">c</span>(<span class="dv">6</span>,<span class="dv">4</span>,<span class="dv">1</span>,<span class="dv">1</span>))</a> -<a class="sourceLine" id="cb526-6" data-line-number="6"><span class="kw">boxplot</span>(res, <span class="dt">las=</span><span class="dv">2</span>)</a></code></pre></div> +<div class="sourceCode" id="cb569"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb569-1" data-line-number="1">res <-<span class="st"> </span><span class="kw">list</span>()</a> +<a class="sourceLine" id="cb569-2" data-line-number="2"><span class="cf">for</span>(n <span class="cf">in</span> <span class="kw">assayNames</span>(umi.qc)) {</a> +<a class="sourceLine" id="cb569-3" data-line-number="3"> res[[n]] <-<span class="st"> </span><span class="kw">suppressWarnings</span>(<span class="kw">calc_cell_RLE</span>(<span class="kw">assay</span>(umi.qc, n), erccs))</a> +<a class="sourceLine" id="cb569-4" data-line-number="4">}</a> +<a class="sourceLine" id="cb569-5" data-line-number="5"><span class="kw">par</span>(<span class="dt">mar=</span><span class="kw">c</span>(<span class="dv">6</span>,<span 
class="dv">4</span>,<span class="dv">1</span>,<span class="dv">1</span>))</a> +<a class="sourceLine" id="cb569-6" data-line-number="6"><span class="kw">boxplot</span>(res, <span class="dt">las=</span><span class="dv">2</span>)</a></code></pre></div> +<p><img src="remove-conf_files/figure-html/effectiveness2-1.png" width="90%" style="display: block; margin: auto;" /></p> </div> <div id="effectiveness-3" class="section level4"> <h4><span class="header-section-number">7.6.8.3</span> Effectiveness 3</h4> @@ -5606,273 +6544,373 @@ individual independently to check for residual batch effects. However, this method will not identify residual batch-effects which are confounded with biological conditions. In addition, <code>kBET</code> does not determine if biological signal has been preserved.</p> -<div class="sourceCode" id="cb527"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb527-1" data-line-number="1">compare_kBET_results <-<span class="st"> </span><span class="cf">function</span>(sce){</a> -<a class="sourceLine" id="cb527-2" data-line-number="2"> indiv <-<span class="st"> </span><span class="kw">unique</span>(sce<span class="op">$</span>individual)</a> -<a class="sourceLine" id="cb527-3" data-line-number="3"> norms <-<span class="st"> </span><span class="kw">assayNames</span>(sce) <span class="co"># Get all normalizations</span></a> -<a class="sourceLine" id="cb527-4" data-line-number="4"> results <-<span class="st"> </span><span class="kw">list</span>()</a> -<a class="sourceLine" id="cb527-5" data-line-number="5"> <span class="cf">for</span> (i <span class="cf">in</span> indiv){ </a> -<a class="sourceLine" id="cb527-6" data-line-number="6"> <span class="cf">for</span> (j <span class="cf">in</span> norms){</a> -<a class="sourceLine" id="cb527-7" data-line-number="7"> tmp <-<span class="st"> </span><span class="kw">kBET</span>(</a> -<a class="sourceLine" id="cb527-8" data-line-number="8"> <span class="dt">df =</span> <span 
class="kw">t</span>(<span class="kw">assay</span>(sce[,sce<span class="op">$</span>individual<span class="op">==</span><span class="st"> </span>i], j)), </a> -<a class="sourceLine" id="cb527-9" data-line-number="9"> <span class="dt">batch =</span> sce<span class="op">$</span>batch[sce<span class="op">$</span>individual<span class="op">==</span>i], </a> -<a class="sourceLine" id="cb527-10" data-line-number="10"> <span class="dt">heuristic =</span> <span class="ot">TRUE</span>, </a> -<a class="sourceLine" id="cb527-11" data-line-number="11"> <span class="dt">verbose =</span> <span class="ot">FALSE</span>, </a> -<a class="sourceLine" id="cb527-12" data-line-number="12"> <span class="dt">addTest =</span> <span class="ot">FALSE</span>, </a> -<a class="sourceLine" id="cb527-13" data-line-number="13"> <span class="dt">plot =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb527-14" data-line-number="14"> results[[i]][[j]] <-<span class="st"> </span>tmp<span class="op">$</span>summary<span class="op">$</span>kBET.observed[<span class="dv">1</span>]</a> -<a class="sourceLine" id="cb527-15" data-line-number="15"> }</a> -<a class="sourceLine" id="cb527-16" data-line-number="16"> }</a> -<a class="sourceLine" id="cb527-17" data-line-number="17"> <span class="kw">return</span>(<span class="kw">as.data.frame</span>(results))</a> -<a class="sourceLine" id="cb527-18" data-line-number="18">}</a> -<a class="sourceLine" id="cb527-19" data-line-number="19"></a> -<a class="sourceLine" id="cb527-20" data-line-number="20">eff_debatching <-<span class="st"> </span><span class="kw">compare_kBET_results</span>(umi.qc)</a></code></pre></div> -<div class="sourceCode" id="cb528"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb528-1" data-line-number="1"><span class="kw">require</span>(<span class="st">"reshape2"</span>)</a> -<a class="sourceLine" id="cb528-2" data-line-number="2"><span class="kw">require</span>(<span 
class="st">"RColorBrewer"</span>)</a> -<a class="sourceLine" id="cb528-3" data-line-number="3"><span class="co"># Plot results</span></a> -<a class="sourceLine" id="cb528-4" data-line-number="4">dod <-<span class="st"> </span><span class="kw">melt</span>(<span class="kw">as.matrix</span>(eff_debatching), <span class="dt">value.name =</span> <span class="st">"kBET"</span>)</a> -<a class="sourceLine" id="cb528-5" data-line-number="5"><span class="kw">colnames</span>(dod)[<span class="dv">1</span><span class="op">:</span><span class="dv">2</span>] <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Normalisation"</span>, <span class="st">"Individual"</span>)</a> -<a class="sourceLine" id="cb528-6" data-line-number="6"></a> -<a class="sourceLine" id="cb528-7" data-line-number="7">colorset <-<span class="st"> </span><span class="kw">c</span>(<span class="st">'gray'</span>, <span class="kw">brewer.pal</span>(<span class="dt">n =</span> <span class="dv">9</span>, <span class="st">"Oranges"</span>))</a> -<a class="sourceLine" id="cb528-8" data-line-number="8"></a> -<a class="sourceLine" id="cb528-9" data-line-number="9"><span class="kw">ggplot</span>(dod, <span class="kw">aes</span>(Normalisation, Individual, <span class="dt">fill=</span>kBET)) <span class="op">+</span><span class="st"> </span></a> -<a class="sourceLine" id="cb528-10" data-line-number="10"><span class="st"> </span><span class="kw">geom_tile</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb528-11" data-line-number="11"><span class="st"> </span><span class="kw">scale_fill_gradient2</span>(</a> -<a class="sourceLine" id="cb528-12" data-line-number="12"> <span class="dt">na.value =</span> <span class="st">"gray"</span>,</a> -<a class="sourceLine" id="cb528-13" data-line-number="13"> <span class="dt">low =</span> colorset[<span class="dv">2</span>],</a> -<a class="sourceLine" id="cb528-14" data-line-number="14"> <span class="dt">mid=</span>colorset[<span 
class="dv">6</span>],</a> -<a class="sourceLine" id="cb528-15" data-line-number="15"> <span class="dt">high =</span> colorset[<span class="dv">10</span>],</a> -<a class="sourceLine" id="cb528-16" data-line-number="16"> <span class="dt">midpoint =</span> <span class="fl">0.5</span>, <span class="dt">limit =</span> <span class="kw">c</span>(<span class="dv">0</span>,<span class="dv">1</span>)) <span class="op">+</span></a> -<a class="sourceLine" id="cb528-17" data-line-number="17"><span class="st"> </span><span class="kw">scale_x_discrete</span>(<span class="dt">expand =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">0</span>)) <span class="op">+</span></a> -<a class="sourceLine" id="cb528-18" data-line-number="18"><span class="st"> </span><span class="kw">scale_y_discrete</span>(<span class="dt">expand =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">0</span>)) <span class="op">+</span><span class="st"> </span></a> -<a class="sourceLine" id="cb528-19" data-line-number="19"><span class="st"> </span><span class="kw">theme</span>(</a> -<a class="sourceLine" id="cb528-20" data-line-number="20"> <span class="dt">axis.text.x =</span> <span class="kw">element_text</span>(</a> -<a class="sourceLine" id="cb528-21" data-line-number="21"> <span class="dt">angle =</span> <span class="dv">45</span>, </a> -<a class="sourceLine" id="cb528-22" data-line-number="22"> <span class="dt">vjust =</span> <span class="dv">1</span>, </a> -<a class="sourceLine" id="cb528-23" data-line-number="23"> <span class="dt">size =</span> <span class="dv">12</span>, </a> -<a class="sourceLine" id="cb528-24" data-line-number="24"> <span class="dt">hjust =</span> <span class="dv">1</span></a> -<a class="sourceLine" id="cb528-25" data-line-number="25"> )</a> -<a class="sourceLine" id="cb528-26" data-line-number="26"> ) <span class="op">+</span><span class="st"> </span></a> -<a class="sourceLine" id="cb528-27" data-line-number="27"><span 
class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Effect of batch regression methods per individual"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb570"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb570-1" data-line-number="1">compare_kBET_results <-<span class="st"> </span><span class="cf">function</span>(sce){</a> +<a class="sourceLine" id="cb570-2" data-line-number="2"> indiv <-<span class="st"> </span><span class="kw">unique</span>(sce<span class="op">$</span>individual)</a> +<a class="sourceLine" id="cb570-3" data-line-number="3"> norms <-<span class="st"> </span><span class="kw">assayNames</span>(sce) <span class="co"># Get all normalizations</span></a> +<a class="sourceLine" id="cb570-4" data-line-number="4"> results <-<span class="st"> </span><span class="kw">list</span>()</a> +<a class="sourceLine" id="cb570-5" data-line-number="5"> <span class="cf">for</span> (i <span class="cf">in</span> indiv){ </a> +<a class="sourceLine" id="cb570-6" data-line-number="6"> <span class="cf">for</span> (j <span class="cf">in</span> norms){</a> +<a class="sourceLine" id="cb570-7" data-line-number="7"> tmp <-<span class="st"> </span><span class="kw">kBET</span>(</a> +<a class="sourceLine" id="cb570-8" data-line-number="8"> <span class="dt">df =</span> <span class="kw">t</span>(<span class="kw">assay</span>(sce[,sce<span class="op">$</span>individual<span class="op">==</span><span class="st"> </span>i], j)), </a> +<a class="sourceLine" id="cb570-9" data-line-number="9"> <span class="dt">batch =</span> sce<span class="op">$</span>batch[sce<span class="op">$</span>individual<span class="op">==</span>i], </a> +<a class="sourceLine" id="cb570-10" data-line-number="10"> <span class="dt">heuristic =</span> <span class="ot">TRUE</span>, </a> +<a class="sourceLine" id="cb570-11" data-line-number="11"> <span class="dt">verbose =</span> <span class="ot">FALSE</span>, </a> +<a class="sourceLine" id="cb570-12" 
data-line-number="12"> <span class="dt">addTest =</span> <span class="ot">FALSE</span>, </a> +<a class="sourceLine" id="cb570-13" data-line-number="13"> <span class="dt">plot =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb570-14" data-line-number="14"> results[[i]][[j]] <-<span class="st"> </span>tmp<span class="op">$</span>summary<span class="op">$</span>kBET.observed[<span class="dv">1</span>]</a> +<a class="sourceLine" id="cb570-15" data-line-number="15"> }</a> +<a class="sourceLine" id="cb570-16" data-line-number="16"> }</a> +<a class="sourceLine" id="cb570-17" data-line-number="17"> <span class="kw">return</span>(<span class="kw">as.data.frame</span>(results))</a> +<a class="sourceLine" id="cb570-18" data-line-number="18">}</a> +<a class="sourceLine" id="cb570-19" data-line-number="19"></a> +<a class="sourceLine" id="cb570-20" data-line-number="20">eff_debatching <-<span class="st"> </span><span class="kw">compare_kBET_results</span>(umi.qc)</a></code></pre></div> +<div class="sourceCode" id="cb571"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb571-1" data-line-number="1"><span class="kw">require</span>(<span class="st">"reshape2"</span>)</a> +<a class="sourceLine" id="cb571-2" data-line-number="2"><span class="kw">require</span>(<span class="st">"RColorBrewer"</span>)</a> +<a class="sourceLine" id="cb571-3" data-line-number="3"><span class="co"># Plot results</span></a> +<a class="sourceLine" id="cb571-4" data-line-number="4">dod <-<span class="st"> </span><span class="kw">melt</span>(<span class="kw">as.matrix</span>(eff_debatching), <span class="dt">value.name =</span> <span class="st">"kBET"</span>)</a> +<a class="sourceLine" id="cb571-5" data-line-number="5"><span class="kw">colnames</span>(dod)[<span class="dv">1</span><span class="op">:</span><span class="dv">2</span>] <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Normalisation"</span>, <span 
class="st">"Individual"</span>)</a> +<a class="sourceLine" id="cb571-6" data-line-number="6"></a> +<a class="sourceLine" id="cb571-7" data-line-number="7">colorset <-<span class="st"> </span><span class="kw">c</span>(<span class="st">'gray'</span>, <span class="kw">brewer.pal</span>(<span class="dt">n =</span> <span class="dv">9</span>, <span class="st">"Oranges"</span>))</a> +<a class="sourceLine" id="cb571-8" data-line-number="8"></a> +<a class="sourceLine" id="cb571-9" data-line-number="9"><span class="kw">ggplot</span>(dod, <span class="kw">aes</span>(Normalisation, Individual, <span class="dt">fill=</span>kBET)) <span class="op">+</span><span class="st"> </span></a> +<a class="sourceLine" id="cb571-10" data-line-number="10"><span class="st"> </span><span class="kw">geom_tile</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb571-11" data-line-number="11"><span class="st"> </span><span class="kw">scale_fill_gradient2</span>(</a> +<a class="sourceLine" id="cb571-12" data-line-number="12"> <span class="dt">na.value =</span> <span class="st">"gray"</span>,</a> +<a class="sourceLine" id="cb571-13" data-line-number="13"> <span class="dt">low =</span> colorset[<span class="dv">2</span>],</a> +<a class="sourceLine" id="cb571-14" data-line-number="14"> <span class="dt">mid=</span>colorset[<span class="dv">6</span>],</a> +<a class="sourceLine" id="cb571-15" data-line-number="15"> <span class="dt">high =</span> colorset[<span class="dv">10</span>],</a> +<a class="sourceLine" id="cb571-16" data-line-number="16"> <span class="dt">midpoint =</span> <span class="fl">0.5</span>, <span class="dt">limit =</span> <span class="kw">c</span>(<span class="dv">0</span>,<span class="dv">1</span>)) <span class="op">+</span></a> +<a class="sourceLine" id="cb571-17" data-line-number="17"><span class="st"> </span><span class="kw">scale_x_discrete</span>(<span class="dt">expand =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">0</span>)) <span 
class="op">+</span></a> +<a class="sourceLine" id="cb571-18" data-line-number="18"><span class="st"> </span><span class="kw">scale_y_discrete</span>(<span class="dt">expand =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">0</span>)) <span class="op">+</span><span class="st"> </span></a> +<a class="sourceLine" id="cb571-19" data-line-number="19"><span class="st"> </span><span class="kw">theme</span>(</a> +<a class="sourceLine" id="cb571-20" data-line-number="20"> <span class="dt">axis.text.x =</span> <span class="kw">element_text</span>(</a> +<a class="sourceLine" id="cb571-21" data-line-number="21"> <span class="dt">angle =</span> <span class="dv">45</span>, </a> +<a class="sourceLine" id="cb571-22" data-line-number="22"> <span class="dt">vjust =</span> <span class="dv">1</span>, </a> +<a class="sourceLine" id="cb571-23" data-line-number="23"> <span class="dt">size =</span> <span class="dv">12</span>, </a> +<a class="sourceLine" id="cb571-24" data-line-number="24"> <span class="dt">hjust =</span> <span class="dv">1</span></a> +<a class="sourceLine" id="cb571-25" data-line-number="25"> )</a> +<a class="sourceLine" id="cb571-26" data-line-number="26"> ) <span class="op">+</span><span class="st"> </span></a> +<a class="sourceLine" id="cb571-27" data-line-number="27"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Effect of batch regression methods per individual"</span>)</a></code></pre></div> +<p><img src="remove-conf_files/figure-html/kbet-1.png" width="90%" style="display: block; margin: auto;" /></p> <p><strong>Exercise 4</strong></p> <p>Why do the raw counts appear to have little batch effects?</p> </div> </div> <div id="big-exercise-2" class="section level3"> <h3><span class="header-section-number">7.6.9</span> Big Exercise</h3> -<p>Perform the same analysis with read counts of the <code>tung</code> data. Use <code>tung/reads.rds</code> file to load the reads <code>SCE</code> object. 
Once you have finished please compare your results to ours (next chapter). Additionally, experiment with other combinations of normalizations and compare the results.</p> +<p>Perform the same analysis with read counts of the <code>tung</code> data. Use +<code>tung/reads.rds</code> file to load the reads <code>SCE</code> object. Once you have finished +please compare your results to ours (next chapter). Additionally, experiment +with other combinations of normalizations and compare the results.</p> </div> <div id="sessioninfo-4" class="section level3"> <h3><span class="header-section-number">7.6.10</span> sessionInfo()</h3> +<pre><code>## R version 3.6.0 (2019-04-26) +## Platform: x86_64-pc-linux-gnu (64-bit) +## Running under: Ubuntu 18.04.3 LTS +## +## Matrix products: default +## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 +## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 +## +## locale: +## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C +## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 +## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 +## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C +## [9] LC_ADDRESS=C LC_TELEPHONE=C +## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C +## +## attached base packages: +## [1] stats4 parallel stats graphics grDevices utils datasets +## [8] methods base +## +## other attached packages: +## [1] RColorBrewer_1.1-2 reshape2_1.4.3 +## [3] harmony_1.0 Rcpp_1.0.2 +## [5] sva_3.32.1 genefilter_1.66.0 +## [7] mgcv_1.8-28 nlme_3.1-139 +## [9] kBET_0.99.6 scran_1.12.1 +## [11] scater_1.12.2 ggplot2_3.2.1 +## [13] SingleCellExperiment_1.6.0 RUVSeq_1.18.0 +## [15] edgeR_3.26.8 limma_3.40.6 +## [17] EDASeq_2.18.0 ShortRead_1.42.0 +## [19] GenomicAlignments_1.20.1 SummarizedExperiment_1.14.1 +## [21] DelayedArray_0.10.0 matrixStats_0.55.0 +## [23] Rsamtools_2.0.1 GenomicRanges_1.36.1 +## [25] GenomeInfoDb_1.20.0 Biostrings_2.52.0 +## [27] XVector_0.24.0 IRanges_2.18.3 +## [29] S4Vectors_0.22.1 BiocParallel_1.18.1 +## [31] 
Biobase_2.44.0 BiocGenerics_0.30.0 +## [33] scRNA.seq.funcs_0.1.0 +## +## loaded via a namespace (and not attached): +## [1] backports_1.1.4 aroma.light_3.14.0 +## [3] plyr_1.8.4 igraph_1.2.4.1 +## [5] lazyeval_0.2.2 splines_3.6.0 +## [7] listenv_0.7.0 elliptic_1.4-0 +## [9] digest_0.6.21 htmltools_0.3.6 +## [11] viridis_0.5.1 magrittr_1.5 +## [13] memoise_1.1.0 contfrac_1.1-12 +## [15] cluster_2.1.0 globals_0.12.4 +## [17] annotate_1.62.0 R.utils_2.9.0 +## [19] prettyunits_1.0.2 colorspace_1.4-1 +## [21] blob_1.2.0 xfun_0.9 +## [23] dplyr_0.8.3 crayon_1.3.4 +## [25] RCurl_1.95-4.12 zeallot_0.1.0 +## [27] survival_2.43-3 glue_1.3.1 +## [29] gtable_0.3.0 zlibbioc_1.30.0 +## [31] BiocSingular_1.0.0 future.apply_1.3.0 +## [33] scales_1.0.0 DESeq_1.36.0 +## [35] DBI_1.0.0 viridisLite_0.3.0 +## [37] xtable_1.8-4 progress_1.2.2 +## [39] dqrng_0.2.1 bit_1.1-14 +## [41] rsvd_1.0.2 deSolve_1.24 +## [43] httr_1.4.1 FNN_1.1.3 +## [45] pkgconfig_2.0.3 XML_3.98-1.20 +## [47] R.methodsS3_1.7.1 locfit_1.5-9.1 +## [49] dynamicTreeCut_1.63-1 tidyselect_0.2.5 +## [51] labeling_0.3 rlang_0.4.0 +## [53] AnnotationDbi_1.46.1 munsell_0.5.0 +## [55] tools_3.6.0 moments_0.14 +## [57] RSQLite_2.1.2 batchelor_1.0.1 +## [59] evaluate_0.14 stringr_1.4.0 +## [61] yaml_2.2.0 knitr_1.25 +## [63] bit64_0.9-7 hypergeo_1.2-13 +## [65] purrr_0.3.2 future_1.14.0 +## [67] R.oo_1.22.0 biomaRt_2.40.4 +## [69] compiler_3.6.0 beeswarm_0.2.3 +## [71] tibble_2.1.3 statmod_1.4.32 +## [73] geneplotter_1.62.0 stringi_1.4.3 +## [75] highr_0.8 GenomicFeatures_1.36.4 +## [77] lattice_0.20-38 Matrix_1.2-17 +## [79] vctrs_0.2.0 lifecycle_0.1.0 +## [81] pillar_1.4.2 BiocNeighbors_1.2.0 +## [83] cowplot_1.0.0 bitops_1.0-6 +## [85] orthopolynom_1.0-5 irlba_2.3.3 +## [87] rtracklayer_1.44.4 R6_2.4.0 +## [89] latticeExtra_0.6-28 hwriter_1.3.2 +## [91] bookdown_0.13 gridExtra_2.3 +## [93] vipor_0.4.5 codetools_0.2-16 +## [95] MASS_7.3-51.1 assertthat_0.2.1 +## [97] withr_2.1.2 sctransform_0.2.0 +## [99] 
GenomeInfoDbData_1.2.1 hms_0.5.1 +## [101] grid_3.6.0 tidyr_1.0.0 +## [103] rmarkdown_1.15 DelayedMatrixStats_1.6.1 +## [105] Rtsne_0.15 ggbeeswarm_0.6.0</code></pre> </div> </div> <div id="dealing-with-confounders-reads" class="section level2"> <h2><span class="header-section-number">7.7</span> Dealing with confounders (Reads)</h2> -<div class="sourceCode" id="cb529"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb529-1" data-line-number="1"><span class="kw">library</span>(scRNA.seq.funcs)</a> -<a class="sourceLine" id="cb529-2" data-line-number="2"><span class="kw">library</span>(RUVSeq)</a> -<a class="sourceLine" id="cb529-3" data-line-number="3"><span class="kw">library</span>(scater)</a> -<a class="sourceLine" id="cb529-4" data-line-number="4"><span class="kw">library</span>(SingleCellExperiment)</a> -<a class="sourceLine" id="cb529-5" data-line-number="5"><span class="kw">library</span>(scran)</a> -<a class="sourceLine" id="cb529-6" data-line-number="6"><span class="kw">library</span>(kBET)</a> -<a class="sourceLine" id="cb529-7" data-line-number="7"><span class="kw">library</span>(sva) <span class="co"># Combat</span></a> -<a class="sourceLine" id="cb529-8" data-line-number="8"><span class="kw">library</span>(harmony)</a> -<a class="sourceLine" id="cb529-9" data-line-number="9"><span class="kw">library</span>(edgeR)</a> -<a class="sourceLine" id="cb529-10" data-line-number="10"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a> -<a class="sourceLine" id="cb529-11" data-line-number="11"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb529-12" data-line-number="12">reads <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/tung/reads.rds"</span>)</a> -<a class="sourceLine" id="cb529-13" data-line-number="13">reads.qc <-<span class="st"> </span>reads[<span class="kw">rowData</span>(reads)<span 
class="op">$</span>use, <span class="kw">colData</span>(reads)<span class="op">$</span>use]</a> -<a class="sourceLine" id="cb529-14" data-line-number="14">endog_genes <-<span class="st"> </span><span class="op">!</span><span class="kw">rowData</span>(reads.qc)<span class="op">$</span>is_feature_control</a> -<a class="sourceLine" id="cb529-15" data-line-number="15">erccs <-<span class="st"> </span><span class="kw">rowData</span>(reads.qc)<span class="op">$</span>is_feature_control</a> -<a class="sourceLine" id="cb529-16" data-line-number="16"></a> -<a class="sourceLine" id="cb529-17" data-line-number="17">qclust <-<span class="st"> </span><span class="kw">quickCluster</span>(reads.qc, <span class="dt">min.size =</span> <span class="dv">30</span>)</a> -<a class="sourceLine" id="cb529-18" data-line-number="18">reads.qc <-<span class="st"> </span><span class="kw">computeSumFactors</span>(reads.qc, <span class="dt">sizes =</span> <span class="dv">15</span>, <span class="dt">clusters =</span> qclust)</a> -<a class="sourceLine" id="cb529-19" data-line-number="19">reads.qc <-<span class="st"> </span><span class="kw">normalize</span>(reads.qc)</a></code></pre></div> -<div class="sourceCode" id="cb530"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb530-1" data-line-number="1">ruvg <-<span class="st"> </span><span class="kw">RUVg</span>(<span class="kw">counts</span>(reads.qc), erccs, <span class="dt">k =</span> <span class="dv">1</span>)</a> -<a class="sourceLine" id="cb530-2" data-line-number="2"><span class="kw">assay</span>(reads.qc, <span class="st">"ruvg1"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> -<a class="sourceLine" id="cb530-3" data-line-number="3"> <span class="kw">t</span>(<span class="kw">t</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">*</span><span 
class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> -<a class="sourceLine" id="cb530-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb530-5" data-line-number="5">ruvg <-<span class="st"> </span><span class="kw">RUVg</span>(<span class="kw">counts</span>(reads.qc), erccs, <span class="dt">k =</span> <span class="dv">10</span>)</a> -<a class="sourceLine" id="cb530-6" data-line-number="6"><span class="kw">assay</span>(reads.qc, <span class="st">"ruvg10"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> -<a class="sourceLine" id="cb530-7" data-line-number="7"> <span class="kw">t</span>(<span class="kw">t</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> -<a class="sourceLine" id="cb530-8" data-line-number="8">)</a></code></pre></div> -<div class="sourceCode" id="cb531"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb531-1" data-line-number="1">scIdx <-<span class="st"> </span><span class="kw">matrix</span>(<span class="op">-</span><span class="dv">1</span>, <span class="dt">ncol =</span> <span class="kw">max</span>(<span class="kw">table</span>(reads.qc<span class="op">$</span>individual)), <span class="dt">nrow =</span> <span class="dv">3</span>)</a> -<a class="sourceLine" id="cb531-2" data-line-number="2">tmp <-<span class="st"> </span><span class="kw">which</span>(reads.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>)</a> -<a class="sourceLine" id="cb531-3" data-line-number="3">scIdx[<span class="dv">1</span>, <span class="dv">1</span><span class="op">:</span><span 
class="kw">length</span>(tmp)] <-<span class="st"> </span>tmp</a> -<a class="sourceLine" id="cb531-4" data-line-number="4">tmp <-<span class="st"> </span><span class="kw">which</span>(reads.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>)</a> -<a class="sourceLine" id="cb531-5" data-line-number="5">scIdx[<span class="dv">2</span>, <span class="dv">1</span><span class="op">:</span><span class="kw">length</span>(tmp)] <-<span class="st"> </span>tmp</a> -<a class="sourceLine" id="cb531-6" data-line-number="6">tmp <-<span class="st"> </span><span class="kw">which</span>(reads.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>)</a> -<a class="sourceLine" id="cb531-7" data-line-number="7">scIdx[<span class="dv">3</span>, <span class="dv">1</span><span class="op">:</span><span class="kw">length</span>(tmp)] <-<span class="st"> </span>tmp</a> -<a class="sourceLine" id="cb531-8" data-line-number="8">cIdx <-<span class="st"> </span><span class="kw">rownames</span>(reads.qc)</a> -<a class="sourceLine" id="cb531-9" data-line-number="9">ruvs <-<span class="st"> </span><span class="kw">RUVs</span>(<span class="kw">counts</span>(reads.qc), cIdx, <span class="dt">k =</span> <span class="dv">1</span>, <span class="dt">scIdx =</span> scIdx, <span class="dt">isLog =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb531-10" data-line-number="10"><span class="kw">assay</span>(reads.qc, <span class="st">"ruvs1"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> -<a class="sourceLine" id="cb531-11" data-line-number="11"> <span class="kw">t</span>(<span class="kw">t</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span 
class="st"> </span><span class="dv">1</span></a> -<a class="sourceLine" id="cb531-12" data-line-number="12">)</a> -<a class="sourceLine" id="cb531-13" data-line-number="13">ruvs <-<span class="st"> </span><span class="kw">RUVs</span>(<span class="kw">counts</span>(reads.qc), cIdx, <span class="dt">k =</span> <span class="dv">10</span>, <span class="dt">scIdx =</span> scIdx, <span class="dt">isLog =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb531-14" data-line-number="14"><span class="kw">assay</span>(reads.qc, <span class="st">"ruvs10"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> -<a class="sourceLine" id="cb531-15" data-line-number="15"> <span class="kw">t</span>(<span class="kw">t</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> -<a class="sourceLine" id="cb531-16" data-line-number="16">)</a></code></pre></div> -<div class="sourceCode" id="cb532"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb532-1" data-line-number="1">combat_data <-<span class="st"> </span><span class="kw">logcounts</span>(reads.qc)</a> -<a class="sourceLine" id="cb532-2" data-line-number="2">mod_data <-<span class="st"> </span><span class="kw">as.data.frame</span>(<span class="kw">t</span>(combat_data))</a> -<a class="sourceLine" id="cb532-3" data-line-number="3"><span class="co"># Basic batch removal</span></a> -<a class="sourceLine" id="cb532-4" data-line-number="4">mod0 =<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span><span class="dv">1</span>, <span class="dt">data =</span> mod_data) </a> -<a class="sourceLine" id="cb532-5" data-line-number="5"><span 
class="co"># Preserve biological variability</span></a> -<a class="sourceLine" id="cb532-6" data-line-number="6">mod1 =<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span>reads.qc<span class="op">$</span>individual, <span class="dt">data =</span> mod_data) </a> -<a class="sourceLine" id="cb532-7" data-line-number="7"><span class="co"># adjust for total genes detected</span></a> -<a class="sourceLine" id="cb532-8" data-line-number="8">mod2 =<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span>reads.qc<span class="op">$</span>total_features_by_counts, <span class="dt">data =</span> mod_data)</a> -<a class="sourceLine" id="cb532-9" data-line-number="9"><span class="kw">assay</span>(reads.qc, <span class="st">"combat"</span>) <-<span class="st"> </span><span class="kw">ComBat</span>(</a> -<a class="sourceLine" id="cb532-10" data-line-number="10"> <span class="dt">dat =</span> <span class="kw">t</span>(mod_data), </a> -<a class="sourceLine" id="cb532-11" data-line-number="11"> <span class="dt">batch =</span> <span class="kw">factor</span>(reads.qc<span class="op">$</span>batch), </a> -<a class="sourceLine" id="cb532-12" data-line-number="12"> <span class="dt">mod =</span> mod0,</a> -<a class="sourceLine" id="cb532-13" data-line-number="13"> <span class="dt">par.prior =</span> <span class="ot">TRUE</span>,</a> -<a class="sourceLine" id="cb532-14" data-line-number="14"> <span class="dt">prior.plots =</span> <span class="ot">FALSE</span></a> -<a class="sourceLine" id="cb532-15" data-line-number="15">)</a></code></pre></div> +<div class="sourceCode" id="cb573"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb573-1" data-line-number="1"><span class="kw">library</span>(scRNA.seq.funcs)</a> +<a class="sourceLine" id="cb573-2" data-line-number="2"><span class="kw">library</span>(RUVSeq)</a> +<a class="sourceLine" id="cb573-3" 
data-line-number="3"><span class="kw">library</span>(scater)</a> +<a class="sourceLine" id="cb573-4" data-line-number="4"><span class="kw">library</span>(SingleCellExperiment)</a> +<a class="sourceLine" id="cb573-5" data-line-number="5"><span class="kw">library</span>(scran)</a> +<a class="sourceLine" id="cb573-6" data-line-number="6"><span class="kw">library</span>(kBET)</a> +<a class="sourceLine" id="cb573-7" data-line-number="7"><span class="kw">library</span>(sva) <span class="co"># Combat</span></a> +<a class="sourceLine" id="cb573-8" data-line-number="8"><span class="kw">library</span>(harmony)</a> +<a class="sourceLine" id="cb573-9" data-line-number="9"><span class="kw">library</span>(edgeR)</a> +<a class="sourceLine" id="cb573-10" data-line-number="10"><span class="kw">set.seed</span>(<span class="dv">1234567</span>)</a> +<a class="sourceLine" id="cb573-11" data-line-number="11"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb573-12" data-line-number="12">reads <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/tung/reads.rds"</span>)</a> +<a class="sourceLine" id="cb573-13" data-line-number="13">reads.qc <-<span class="st"> </span>reads[<span class="kw">rowData</span>(reads)<span class="op">$</span>use, <span class="kw">colData</span>(reads)<span class="op">$</span>use]</a> +<a class="sourceLine" id="cb573-14" data-line-number="14">endog_genes <-<span class="st"> </span><span class="op">!</span><span class="kw">rowData</span>(reads.qc)<span class="op">$</span>is_feature_control</a> +<a class="sourceLine" id="cb573-15" data-line-number="15">erccs <-<span class="st"> </span><span class="kw">rowData</span>(reads.qc)<span class="op">$</span>is_feature_control</a> +<a class="sourceLine" id="cb573-16" data-line-number="16"></a> +<a class="sourceLine" id="cb573-17" data-line-number="17">qclust <-<span class="st"> </span><span 
class="kw">quickCluster</span>(reads.qc, <span class="dt">min.size =</span> <span class="dv">30</span>)</a> +<a class="sourceLine" id="cb573-18" data-line-number="18">reads.qc <-<span class="st"> </span><span class="kw">computeSumFactors</span>(reads.qc, <span class="dt">sizes =</span> <span class="dv">15</span>, <span class="dt">clusters =</span> qclust)</a> +<a class="sourceLine" id="cb573-19" data-line-number="19">reads.qc <-<span class="st"> </span><span class="kw">normalize</span>(reads.qc)</a></code></pre></div> +<div class="sourceCode" id="cb574"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb574-1" data-line-number="1">ruvg <-<span class="st"> </span><span class="kw">RUVg</span>(<span class="kw">counts</span>(reads.qc), erccs, <span class="dt">k =</span> <span class="dv">1</span>)</a> +<a class="sourceLine" id="cb574-2" data-line-number="2"><span class="kw">assay</span>(reads.qc, <span class="st">"ruvg1"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> +<a class="sourceLine" id="cb574-3" data-line-number="3"> <span class="kw">t</span>(<span class="kw">t</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> +<a class="sourceLine" id="cb574-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb574-5" data-line-number="5">ruvg <-<span class="st"> </span><span class="kw">RUVg</span>(<span class="kw">counts</span>(reads.qc), erccs, <span class="dt">k =</span> <span class="dv">10</span>)</a> +<a class="sourceLine" id="cb574-6" data-line-number="6"><span class="kw">assay</span>(reads.qc, <span class="st">"ruvg10"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> +<a class="sourceLine" id="cb574-7" 
data-line-number="7"> <span class="kw">t</span>(<span class="kw">t</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvg<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> +<a class="sourceLine" id="cb574-8" data-line-number="8">)</a></code></pre></div> +<div class="sourceCode" id="cb575"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb575-1" data-line-number="1">scIdx <-<span class="st"> </span><span class="kw">matrix</span>(<span class="op">-</span><span class="dv">1</span>, <span class="dt">ncol =</span> <span class="kw">max</span>(<span class="kw">table</span>(reads.qc<span class="op">$</span>individual)), <span class="dt">nrow =</span> <span class="dv">3</span>)</a> +<a class="sourceLine" id="cb575-2" data-line-number="2">tmp <-<span class="st"> </span><span class="kw">which</span>(reads.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>)</a> +<a class="sourceLine" id="cb575-3" data-line-number="3">scIdx[<span class="dv">1</span>, <span class="dv">1</span><span class="op">:</span><span class="kw">length</span>(tmp)] <-<span class="st"> </span>tmp</a> +<a class="sourceLine" id="cb575-4" data-line-number="4">tmp <-<span class="st"> </span><span class="kw">which</span>(reads.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>)</a> +<a class="sourceLine" id="cb575-5" data-line-number="5">scIdx[<span class="dv">2</span>, <span class="dv">1</span><span class="op">:</span><span class="kw">length</span>(tmp)] <-<span class="st"> </span>tmp</a> +<a class="sourceLine" id="cb575-6" data-line-number="6">tmp <-<span class="st"> </span><span class="kw">which</span>(reads.qc<span class="op">$</span>individual <span 
class="op">==</span><span class="st"> "NA19239"</span>)</a> +<a class="sourceLine" id="cb575-7" data-line-number="7">scIdx[<span class="dv">3</span>, <span class="dv">1</span><span class="op">:</span><span class="kw">length</span>(tmp)] <-<span class="st"> </span>tmp</a> +<a class="sourceLine" id="cb575-8" data-line-number="8">cIdx <-<span class="st"> </span><span class="kw">rownames</span>(reads.qc)</a> +<a class="sourceLine" id="cb575-9" data-line-number="9">ruvs <-<span class="st"> </span><span class="kw">RUVs</span>(<span class="kw">counts</span>(reads.qc), cIdx, <span class="dt">k =</span> <span class="dv">1</span>, <span class="dt">scIdx =</span> scIdx, <span class="dt">isLog =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb575-10" data-line-number="10"><span class="kw">assay</span>(reads.qc, <span class="st">"ruvs1"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> +<a class="sourceLine" id="cb575-11" data-line-number="11"> <span class="kw">t</span>(<span class="kw">t</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> +<a class="sourceLine" id="cb575-12" data-line-number="12">)</a> +<a class="sourceLine" id="cb575-13" data-line-number="13">ruvs <-<span class="st"> </span><span class="kw">RUVs</span>(<span class="kw">counts</span>(reads.qc), cIdx, <span class="dt">k =</span> <span class="dv">10</span>, <span class="dt">scIdx =</span> scIdx, <span class="dt">isLog =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb575-14" data-line-number="14"><span class="kw">assay</span>(reads.qc, <span class="st">"ruvs10"</span>) <-<span class="st"> </span><span class="kw">log2</span>(</a> +<a class="sourceLine" 
id="cb575-15" data-line-number="15"> <span class="kw">t</span>(<span class="kw">t</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">/</span><span class="st"> </span><span class="kw">colSums</span>(ruvs<span class="op">$</span>normalizedCounts) <span class="op">*</span><span class="st"> </span><span class="fl">1e6</span>) <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a> +<a class="sourceLine" id="cb575-16" data-line-number="16">)</a></code></pre></div> +<div class="sourceCode" id="cb576"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb576-1" data-line-number="1">combat_data <-<span class="st"> </span><span class="kw">logcounts</span>(reads.qc)</a> +<a class="sourceLine" id="cb576-2" data-line-number="2">mod_data <-<span class="st"> </span><span class="kw">as.data.frame</span>(<span class="kw">t</span>(combat_data))</a> +<a class="sourceLine" id="cb576-3" data-line-number="3"><span class="co"># Basic batch removal</span></a> +<a class="sourceLine" id="cb576-4" data-line-number="4">mod0 =<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span><span class="dv">1</span>, <span class="dt">data =</span> mod_data) </a> +<a class="sourceLine" id="cb576-5" data-line-number="5"><span class="co"># Preserve biological variability</span></a> +<a class="sourceLine" id="cb576-6" data-line-number="6">mod1 =<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span>reads.qc<span class="op">$</span>individual, <span class="dt">data =</span> mod_data) </a> +<a class="sourceLine" id="cb576-7" data-line-number="7"><span class="co"># adjust for total genes detected</span></a> +<a class="sourceLine" id="cb576-8" data-line-number="8">mod2 =<span class="st"> </span><span class="kw">model.matrix</span>(<span class="op">~</span><span class="st"> </span>reads.qc<span 
class="op">$</span>total_features_by_counts, <span class="dt">data =</span> mod_data)</a> +<a class="sourceLine" id="cb576-9" data-line-number="9"><span class="kw">assay</span>(reads.qc, <span class="st">"combat"</span>) <-<span class="st"> </span><span class="kw">ComBat</span>(</a> +<a class="sourceLine" id="cb576-10" data-line-number="10"> <span class="dt">dat =</span> <span class="kw">t</span>(mod_data), </a> +<a class="sourceLine" id="cb576-11" data-line-number="11"> <span class="dt">batch =</span> <span class="kw">factor</span>(reads.qc<span class="op">$</span>batch), </a> +<a class="sourceLine" id="cb576-12" data-line-number="12"> <span class="dt">mod =</span> mod0,</a> +<a class="sourceLine" id="cb576-13" data-line-number="13"> <span class="dt">par.prior =</span> <span class="ot">TRUE</span>,</a> +<a class="sourceLine" id="cb576-14" data-line-number="14"> <span class="dt">prior.plots =</span> <span class="ot">FALSE</span></a> +<a class="sourceLine" id="cb576-15" data-line-number="15">)</a></code></pre></div> <p><strong>Exercise 1</strong></p> -<div class="sourceCode" id="cb533"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb533-1" data-line-number="1">do_mnn <-<span class="st"> </span><span class="cf">function</span>(data.qc) {</a> -<a class="sourceLine" id="cb533-2" data-line-number="2"> batch1 <-<span class="st"> </span><span class="kw">logcounts</span>(data.qc[, data.qc<span class="op">$</span>replicate <span class="op">==</span><span class="st"> "r1"</span>])</a> -<a class="sourceLine" id="cb533-3" data-line-number="3"> batch2 <-<span class="st"> </span><span class="kw">logcounts</span>(data.qc[, data.qc<span class="op">$</span>replicate <span class="op">==</span><span class="st"> "r2"</span>])</a> -<a class="sourceLine" id="cb533-4" data-line-number="4"> batch3 <-<span class="st"> </span><span class="kw">logcounts</span>(data.qc[, data.qc<span class="op">$</span>replicate <span class="op">==</span><span class="st"> 
"r3"</span>])</a> -<a class="sourceLine" id="cb533-5" data-line-number="5"> </a> -<a class="sourceLine" id="cb533-6" data-line-number="6"> <span class="cf">if</span> (<span class="kw">ncol</span>(batch2) <span class="op">></span><span class="st"> </span><span class="dv">0</span>) {</a> -<a class="sourceLine" id="cb533-7" data-line-number="7"> x <-<span class="st"> </span>batchelor<span class="op">::</span><span class="kw">mnnCorrect</span>(</a> -<a class="sourceLine" id="cb533-8" data-line-number="8"> batch1, batch2, batch3,</a> -<a class="sourceLine" id="cb533-9" data-line-number="9"> <span class="dt">k =</span> <span class="dv">20</span>,</a> -<a class="sourceLine" id="cb533-10" data-line-number="10"> <span class="dt">sigma =</span> <span class="fl">0.1</span>,</a> -<a class="sourceLine" id="cb533-11" data-line-number="11"> <span class="dt">cos.norm.in =</span> <span class="ot">TRUE</span>,</a> -<a class="sourceLine" id="cb533-12" data-line-number="12"> <span class="dt">svd.dim =</span> <span class="dv">2</span></a> -<a class="sourceLine" id="cb533-13" data-line-number="13"> )</a> -<a class="sourceLine" id="cb533-14" data-line-number="14"> <span class="kw">return</span>(x)</a> -<a class="sourceLine" id="cb533-15" data-line-number="15"> } <span class="cf">else</span> {</a> -<a class="sourceLine" id="cb533-16" data-line-number="16"> x <-<span class="st"> </span>batchelor<span class="op">::</span><span class="kw">mnnCorrect</span>(</a> -<a class="sourceLine" id="cb533-17" data-line-number="17"> batch1, batch3,</a> -<a class="sourceLine" id="cb533-18" data-line-number="18"> <span class="dt">k =</span> <span class="dv">20</span>,</a> -<a class="sourceLine" id="cb533-19" data-line-number="19"> <span class="dt">sigma =</span> <span class="fl">0.1</span>,</a> -<a class="sourceLine" id="cb533-20" data-line-number="20"> <span class="dt">cos.norm.in =</span> <span class="ot">TRUE</span>,</a> -<a class="sourceLine" id="cb533-21" data-line-number="21"> <span 
class="dt">svd.dim =</span> <span class="dv">2</span></a> -<a class="sourceLine" id="cb533-22" data-line-number="22"> )</a> -<a class="sourceLine" id="cb533-23" data-line-number="23"> <span class="kw">return</span>(x)</a> -<a class="sourceLine" id="cb533-24" data-line-number="24"> }</a> -<a class="sourceLine" id="cb533-25" data-line-number="25">}</a> -<a class="sourceLine" id="cb533-26" data-line-number="26"></a> -<a class="sourceLine" id="cb533-27" data-line-number="27">indi1 <-<span class="st"> </span><span class="kw">do_mnn</span>(reads.qc[, reads.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>])</a> -<a class="sourceLine" id="cb533-28" data-line-number="28">indi2 <-<span class="st"> </span><span class="kw">do_mnn</span>(reads.qc[, reads.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>])</a> -<a class="sourceLine" id="cb533-29" data-line-number="29">indi3 <-<span class="st"> </span><span class="kw">do_mnn</span>(reads.qc[, reads.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>])</a> -<a class="sourceLine" id="cb533-30" data-line-number="30"></a> -<a class="sourceLine" id="cb533-31" data-line-number="31"><span class="kw">assay</span>(reads.qc, <span class="st">"mnn"</span>) <-<span class="st"> </span><span class="kw">cbind</span>(indi1, indi2, indi3)</a> -<a class="sourceLine" id="cb533-32" data-line-number="32"></a> -<a class="sourceLine" id="cb533-33" data-line-number="33"><span class="co"># For a balanced design: </span></a> -<a class="sourceLine" id="cb533-34" data-line-number="34"><span class="co">#assay(reads.qc, "mnn") <- mnnCorrect(</span></a> -<a class="sourceLine" id="cb533-35" data-line-number="35"><span class="co"># list(B1 = logcounts(batch1), B2 = logcounts(batch2), B3 = logcounts(batch3)), </span></a> -<a class="sourceLine" id="cb533-36" data-line-number="36"><span class="co"># k = 20,</span></a> -<a 
class="sourceLine" id="cb533-37" data-line-number="37"><span class="co"># sigma = 0.1,</span></a> -<a class="sourceLine" id="cb533-38" data-line-number="38"><span class="co"># cos.norm = TRUE,</span></a> -<a class="sourceLine" id="cb533-39" data-line-number="39"><span class="co"># svd.dim = 2</span></a> -<a class="sourceLine" id="cb533-40" data-line-number="40"><span class="co">#)</span></a></code></pre></div> -<div class="sourceCode" id="cb534"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb534-1" data-line-number="1">glm_fun <-<span class="st"> </span><span class="cf">function</span>(g, batch, indi) {</a> -<a class="sourceLine" id="cb534-2" data-line-number="2"> model <-<span class="st"> </span><span class="kw">glm</span>(g <span class="op">~</span><span class="st"> </span>batch <span class="op">+</span><span class="st"> </span>indi)</a> -<a class="sourceLine" id="cb534-3" data-line-number="3"> model<span class="op">$</span>coef[<span class="dv">1</span>] <-<span class="st"> </span><span class="dv">0</span> <span class="co"># replace intercept with 0 to preserve reference batch.</span></a> -<a class="sourceLine" id="cb534-4" data-line-number="4"> <span class="kw">return</span>(model<span class="op">$</span>coef)</a> -<a class="sourceLine" id="cb534-5" data-line-number="5">}</a> -<a class="sourceLine" id="cb534-6" data-line-number="6">effects <-<span class="st"> </span><span class="kw">apply</span>(</a> -<a class="sourceLine" id="cb534-7" data-line-number="7"> <span class="kw">logcounts</span>(reads.qc), </a> -<a class="sourceLine" id="cb534-8" data-line-number="8"> <span class="dv">1</span>,</a> -<a class="sourceLine" id="cb534-9" data-line-number="9"> glm_fun, </a> -<a class="sourceLine" id="cb534-10" data-line-number="10"> <span class="dt">batch =</span> reads.qc<span class="op">$</span>batch, </a> -<a class="sourceLine" id="cb534-11" data-line-number="11"> <span class="dt">indi =</span> reads.qc<span 
class="op">$</span>individual</a> -<a class="sourceLine" id="cb534-12" data-line-number="12">)</a> -<a class="sourceLine" id="cb534-13" data-line-number="13">corrected <-<span class="st"> </span><span class="kw">logcounts</span>(reads.qc) <span class="op">-</span><span class="st"> </span><span class="kw">t</span>(effects[<span class="kw">as.numeric</span>(<span class="kw">factor</span>(reads.qc<span class="op">$</span>batch)), ])</a> -<a class="sourceLine" id="cb534-14" data-line-number="14"><span class="kw">assay</span>(reads.qc, <span class="st">"glm"</span>) <-<span class="st"> </span>corrected</a></code></pre></div> +<div class="sourceCode" id="cb577"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb577-1" data-line-number="1">do_mnn <-<span class="st"> </span><span class="cf">function</span>(data.qc) {</a> +<a class="sourceLine" id="cb577-2" data-line-number="2"> batch1 <-<span class="st"> </span><span class="kw">logcounts</span>(data.qc[, data.qc<span class="op">$</span>replicate <span class="op">==</span><span class="st"> "r1"</span>])</a> +<a class="sourceLine" id="cb577-3" data-line-number="3"> batch2 <-<span class="st"> </span><span class="kw">logcounts</span>(data.qc[, data.qc<span class="op">$</span>replicate <span class="op">==</span><span class="st"> "r2"</span>])</a> +<a class="sourceLine" id="cb577-4" data-line-number="4"> batch3 <-<span class="st"> </span><span class="kw">logcounts</span>(data.qc[, data.qc<span class="op">$</span>replicate <span class="op">==</span><span class="st"> "r3"</span>])</a> +<a class="sourceLine" id="cb577-5" data-line-number="5"> </a> +<a class="sourceLine" id="cb577-6" data-line-number="6"> <span class="cf">if</span> (<span class="kw">ncol</span>(batch2) <span class="op">></span><span class="st"> </span><span class="dv">0</span>) {</a> +<a class="sourceLine" id="cb577-7" data-line-number="7"> x <-<span class="st"> </span>batchelor<span class="op">::</span><span 
class="kw">mnnCorrect</span>(</a> +<a class="sourceLine" id="cb577-8" data-line-number="8"> batch1, batch2, batch3,</a> +<a class="sourceLine" id="cb577-9" data-line-number="9"> <span class="dt">k =</span> <span class="dv">20</span>,</a> +<a class="sourceLine" id="cb577-10" data-line-number="10"> <span class="dt">sigma =</span> <span class="fl">0.1</span>,</a> +<a class="sourceLine" id="cb577-11" data-line-number="11"> <span class="dt">cos.norm.in =</span> <span class="ot">TRUE</span>,</a> +<a class="sourceLine" id="cb577-12" data-line-number="12"> <span class="dt">svd.dim =</span> <span class="dv">2</span></a> +<a class="sourceLine" id="cb577-13" data-line-number="13"> )</a> +<a class="sourceLine" id="cb577-14" data-line-number="14"> <span class="kw">return</span>(x)</a> +<a class="sourceLine" id="cb577-15" data-line-number="15"> } <span class="cf">else</span> {</a> +<a class="sourceLine" id="cb577-16" data-line-number="16"> x <-<span class="st"> </span>batchelor<span class="op">::</span><span class="kw">mnnCorrect</span>(</a> +<a class="sourceLine" id="cb577-17" data-line-number="17"> batch1, batch3,</a> +<a class="sourceLine" id="cb577-18" data-line-number="18"> <span class="dt">k =</span> <span class="dv">20</span>,</a> +<a class="sourceLine" id="cb577-19" data-line-number="19"> <span class="dt">sigma =</span> <span class="fl">0.1</span>,</a> +<a class="sourceLine" id="cb577-20" data-line-number="20"> <span class="dt">cos.norm.in =</span> <span class="ot">TRUE</span>,</a> +<a class="sourceLine" id="cb577-21" data-line-number="21"> <span class="dt">svd.dim =</span> <span class="dv">2</span></a> +<a class="sourceLine" id="cb577-22" data-line-number="22"> )</a> +<a class="sourceLine" id="cb577-23" data-line-number="23"> <span class="kw">return</span>(x)</a> +<a class="sourceLine" id="cb577-24" data-line-number="24"> }</a> +<a class="sourceLine" id="cb577-25" data-line-number="25">}</a> +<a class="sourceLine" id="cb577-26" data-line-number="26"></a> +<a 
class="sourceLine" id="cb577-27" data-line-number="27">indi1 <-<span class="st"> </span><span class="kw">do_mnn</span>(reads.qc[, reads.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19098"</span>])</a> +<a class="sourceLine" id="cb577-28" data-line-number="28">indi2 <-<span class="st"> </span><span class="kw">do_mnn</span>(reads.qc[, reads.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19101"</span>])</a> +<a class="sourceLine" id="cb577-29" data-line-number="29">indi3 <-<span class="st"> </span><span class="kw">do_mnn</span>(reads.qc[, reads.qc<span class="op">$</span>individual <span class="op">==</span><span class="st"> "NA19239"</span>])</a> +<a class="sourceLine" id="cb577-30" data-line-number="30"></a> +<a class="sourceLine" id="cb577-31" data-line-number="31"><span class="kw">assay</span>(reads.qc, <span class="st">"mnn"</span>) <-<span class="st"> </span><span class="kw">cbind</span>(indi1, indi2, indi3)</a> +<a class="sourceLine" id="cb577-32" data-line-number="32"></a> +<a class="sourceLine" id="cb577-33" data-line-number="33"><span class="co"># For a balanced design: </span></a> +<a class="sourceLine" id="cb577-34" data-line-number="34"><span class="co">#assay(reads.qc, "mnn") <- mnnCorrect(</span></a> +<a class="sourceLine" id="cb577-35" data-line-number="35"><span class="co"># list(B1 = logcounts(batch1), B2 = logcounts(batch2), B3 = logcounts(batch3)), </span></a> +<a class="sourceLine" id="cb577-36" data-line-number="36"><span class="co"># k = 20,</span></a> +<a class="sourceLine" id="cb577-37" data-line-number="37"><span class="co"># sigma = 0.1,</span></a> +<a class="sourceLine" id="cb577-38" data-line-number="38"><span class="co"># cos.norm = TRUE,</span></a> +<a class="sourceLine" id="cb577-39" data-line-number="39"><span class="co"># svd.dim = 2</span></a> +<a class="sourceLine" id="cb577-40" data-line-number="40"><span class="co">#)</span></a></code></pre></div> +<div 
class="sourceCode" id="cb578"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb578-1" data-line-number="1">glm_fun <-<span class="st"> </span><span class="cf">function</span>(g, batch, indi) {</a> +<a class="sourceLine" id="cb578-2" data-line-number="2"> model <-<span class="st"> </span><span class="kw">glm</span>(g <span class="op">~</span><span class="st"> </span>batch <span class="op">+</span><span class="st"> </span>indi)</a> +<a class="sourceLine" id="cb578-3" data-line-number="3"> model<span class="op">$</span>coef[<span class="dv">1</span>] <-<span class="st"> </span><span class="dv">0</span> <span class="co"># replace intercept with 0 to preserve reference batch.</span></a> +<a class="sourceLine" id="cb578-4" data-line-number="4"> <span class="kw">return</span>(model<span class="op">$</span>coef)</a> +<a class="sourceLine" id="cb578-5" data-line-number="5">}</a> +<a class="sourceLine" id="cb578-6" data-line-number="6">effects <-<span class="st"> </span><span class="kw">apply</span>(</a> +<a class="sourceLine" id="cb578-7" data-line-number="7"> <span class="kw">logcounts</span>(reads.qc), </a> +<a class="sourceLine" id="cb578-8" data-line-number="8"> <span class="dv">1</span>,</a> +<a class="sourceLine" id="cb578-9" data-line-number="9"> glm_fun, </a> +<a class="sourceLine" id="cb578-10" data-line-number="10"> <span class="dt">batch =</span> reads.qc<span class="op">$</span>batch, </a> +<a class="sourceLine" id="cb578-11" data-line-number="11"> <span class="dt">indi =</span> reads.qc<span class="op">$</span>individual</a> +<a class="sourceLine" id="cb578-12" data-line-number="12">)</a> +<a class="sourceLine" id="cb578-13" data-line-number="13">corrected <-<span class="st"> </span><span class="kw">logcounts</span>(reads.qc) <span class="op">-</span><span class="st"> </span><span class="kw">t</span>(effects[<span class="kw">as.numeric</span>(<span class="kw">factor</span>(reads.qc<span class="op">$</span>batch)), ])</a> +<a 
class="sourceLine" id="cb578-14" data-line-number="14"><span class="kw">assay</span>(reads.qc, <span class="st">"glm"</span>) <-<span class="st"> </span>corrected</a></code></pre></div> <p><strong>Exercise 2</strong></p> -<div class="sourceCode" id="cb535"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb535-1" data-line-number="1">reads.qc.endog =<span class="st"> </span>reads.qc[endog_genes,]</a> -<a class="sourceLine" id="cb535-2" data-line-number="2">reads.qc.endog =<span class="st"> </span><span class="kw">runPCA</span>(reads.qc.endog, <span class="dt">exprs_values =</span> <span class="st">'logcounts'</span>, <span class="dt">ncomponents =</span> <span class="dv">20</span>)</a> -<a class="sourceLine" id="cb535-3" data-line-number="3">pca <-<span class="st"> </span><span class="kw">as.matrix</span>(reads.qc.endog<span class="op">@</span>reducedDims<span class="op">@</span>listData[[<span class="st">"PCA"</span>]])</a> -<a class="sourceLine" id="cb535-4" data-line-number="4">harmony_emb <-<span class="st"> </span><span class="kw">HarmonyMatrix</span>(pca, reads.qc.endog<span class="op">$</span>batch, <span class="dt">theta=</span><span class="dv">2</span>, <span class="dt">do_pca=</span><span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb535-5" data-line-number="5">reads.qc.endog<span class="op">@</span>reducedDims<span class="op">@</span>listData[[<span class="st">'harmony'</span>]] <-<span class="st"> </span>harmony_emb</a> -<a class="sourceLine" id="cb535-6" data-line-number="6"></a> -<a class="sourceLine" id="cb535-7" data-line-number="7"><span class="kw">plotReducedDim</span>(</a> -<a class="sourceLine" id="cb535-8" data-line-number="8"> reads.qc.endog,</a> -<a class="sourceLine" id="cb535-9" data-line-number="9"> <span class="dt">use_dimred =</span> <span class="st">'harmony'</span>,</a> -<a class="sourceLine" id="cb535-10" data-line-number="10"> <span class="dt">colour_by =</span> <span 
class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb535-11" data-line-number="11"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb535-12" data-line-number="12"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb535-13" data-line-number="13">)</a></code></pre></div> -<div class="sourceCode" id="cb536"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb536-1" data-line-number="1"><span class="cf">for</span>(n <span class="cf">in</span> <span class="kw">assayNames</span>(reads.qc)) {</a> -<a class="sourceLine" id="cb536-2" data-line-number="2"> tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> -<a class="sourceLine" id="cb536-3" data-line-number="3"> reads.qc[endog_genes, ],</a> -<a class="sourceLine" id="cb536-4" data-line-number="4"> <span class="dt">exprs_values =</span> n</a> -<a class="sourceLine" id="cb536-5" data-line-number="5"> )</a> -<a class="sourceLine" id="cb536-6" data-line-number="6"> <span class="kw">print</span>(</a> -<a class="sourceLine" id="cb536-7" data-line-number="7"> <span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb536-8" data-line-number="8"> tmp,</a> -<a class="sourceLine" id="cb536-9" data-line-number="9"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb536-10" data-line-number="10"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb536-11" data-line-number="11"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb536-12" data-line-number="12"> ) <span class="op">+</span></a> -<a class="sourceLine" id="cb536-13" data-line-number="13"><span class="st"> </span><span class="kw">ggtitle</span>(n)</a> -<a class="sourceLine" id="cb536-14" data-line-number="14"> )</a> -<a 
class="sourceLine" id="cb536-15" data-line-number="15">}</a></code></pre></div> -<div class="sourceCode" id="cb537"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb537-1" data-line-number="1">res <-<span class="st"> </span><span class="kw">list</span>()</a> -<a class="sourceLine" id="cb537-2" data-line-number="2"><span class="cf">for</span>(n <span class="cf">in</span> <span class="kw">assayNames</span>(reads.qc)) {</a> -<a class="sourceLine" id="cb537-3" data-line-number="3"> res[[n]] <-<span class="st"> </span><span class="kw">suppressWarnings</span>(<span class="kw">calc_cell_RLE</span>(<span class="kw">assay</span>(reads.qc, n), erccs))</a> -<a class="sourceLine" id="cb537-4" data-line-number="4">}</a> -<a class="sourceLine" id="cb537-5" data-line-number="5"><span class="kw">par</span>(<span class="dt">mar=</span><span class="kw">c</span>(<span class="dv">6</span>,<span class="dv">4</span>,<span class="dv">1</span>,<span class="dv">1</span>))</a> -<a class="sourceLine" id="cb537-6" data-line-number="6"><span class="kw">boxplot</span>(res, <span class="dt">las=</span><span class="dv">2</span>)</a></code></pre></div> -<div class="sourceCode" id="cb538"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb538-1" data-line-number="1">compare_kBET_results <-<span class="st"> </span><span class="cf">function</span>(sce){</a> -<a class="sourceLine" id="cb538-2" data-line-number="2"> indiv <-<span class="st"> </span><span class="kw">unique</span>(sce<span class="op">$</span>individual)</a> -<a class="sourceLine" id="cb538-3" data-line-number="3"> norms <-<span class="st"> </span><span class="kw">assayNames</span>(sce) <span class="co"># Get all normalizations</span></a> -<a class="sourceLine" id="cb538-4" data-line-number="4"> results <-<span class="st"> </span><span class="kw">list</span>()</a> -<a class="sourceLine" id="cb538-5" data-line-number="5"> <span class="cf">for</span> (i <span class="cf">in</span> 
indiv){ </a> -<a class="sourceLine" id="cb538-6" data-line-number="6"> <span class="cf">for</span> (j <span class="cf">in</span> norms){</a> -<a class="sourceLine" id="cb538-7" data-line-number="7"> tmp <-<span class="st"> </span><span class="kw">kBET</span>(</a> -<a class="sourceLine" id="cb538-8" data-line-number="8"> <span class="dt">df =</span> <span class="kw">t</span>(<span class="kw">assay</span>(sce[,sce<span class="op">$</span>individual<span class="op">==</span><span class="st"> </span>i], j)), </a> -<a class="sourceLine" id="cb538-9" data-line-number="9"> <span class="dt">batch =</span> sce<span class="op">$</span>batch[sce<span class="op">$</span>individual<span class="op">==</span>i], </a> -<a class="sourceLine" id="cb538-10" data-line-number="10"> <span class="dt">heuristic =</span> <span class="ot">TRUE</span>, </a> -<a class="sourceLine" id="cb538-11" data-line-number="11"> <span class="dt">verbose =</span> <span class="ot">FALSE</span>, </a> -<a class="sourceLine" id="cb538-12" data-line-number="12"> <span class="dt">addTest =</span> <span class="ot">FALSE</span>, </a> -<a class="sourceLine" id="cb538-13" data-line-number="13"> <span class="dt">plot =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb538-14" data-line-number="14"> results[[i]][[j]] <-<span class="st"> </span>tmp<span class="op">$</span>summary<span class="op">$</span>kBET.observed[<span class="dv">1</span>]</a> -<a class="sourceLine" id="cb538-15" data-line-number="15"> }</a> -<a class="sourceLine" id="cb538-16" data-line-number="16"> }</a> -<a class="sourceLine" id="cb538-17" data-line-number="17"> <span class="kw">return</span>(<span class="kw">as.data.frame</span>(results))</a> -<a class="sourceLine" id="cb538-18" data-line-number="18">}</a> -<a class="sourceLine" id="cb538-19" data-line-number="19"></a> -<a class="sourceLine" id="cb538-20" data-line-number="20">eff_debatching <-<span class="st"> </span><span 
class="kw">compare_kBET_results</span>(reads.qc)</a></code></pre></div> -<div class="sourceCode" id="cb539"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb539-1" data-line-number="1"><span class="kw">require</span>(<span class="st">"reshape2"</span>)</a> -<a class="sourceLine" id="cb539-2" data-line-number="2"><span class="kw">require</span>(<span class="st">"RColorBrewer"</span>)</a> -<a class="sourceLine" id="cb539-3" data-line-number="3"><span class="co"># Plot results</span></a> -<a class="sourceLine" id="cb539-4" data-line-number="4">dod <-<span class="st"> </span><span class="kw">melt</span>(<span class="kw">as.matrix</span>(eff_debatching), <span class="dt">value.name =</span> <span class="st">"kBET"</span>)</a> -<a class="sourceLine" id="cb539-5" data-line-number="5"><span class="kw">colnames</span>(dod)[<span class="dv">1</span><span class="op">:</span><span class="dv">2</span>] <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Normalisation"</span>, <span class="st">"Individual"</span>)</a> -<a class="sourceLine" id="cb539-6" data-line-number="6"></a> -<a class="sourceLine" id="cb539-7" data-line-number="7">colorset <-<span class="st"> </span><span class="kw">c</span>(<span class="st">'gray'</span>, <span class="kw">brewer.pal</span>(<span class="dt">n =</span> <span class="dv">9</span>, <span class="st">"RdYlBu"</span>))</a> -<a class="sourceLine" id="cb539-8" data-line-number="8"></a> -<a class="sourceLine" id="cb539-9" data-line-number="9"><span class="kw">ggplot</span>(dod, <span class="kw">aes</span>(Normalisation, Individual, <span class="dt">fill=</span>kBET)) <span class="op">+</span><span class="st"> </span></a> -<a class="sourceLine" id="cb539-10" data-line-number="10"><span class="st"> </span><span class="kw">geom_tile</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb539-11" data-line-number="11"><span class="st"> </span><span class="kw">scale_fill_gradient2</span>(</a> 
-<a class="sourceLine" id="cb539-12" data-line-number="12"> <span class="dt">na.value =</span> <span class="st">"gray"</span>,</a> -<a class="sourceLine" id="cb539-13" data-line-number="13"> <span class="dt">low =</span> colorset[<span class="dv">2</span>],</a> -<a class="sourceLine" id="cb539-14" data-line-number="14"> <span class="dt">mid=</span>colorset[<span class="dv">6</span>],</a> -<a class="sourceLine" id="cb539-15" data-line-number="15"> <span class="dt">high =</span> colorset[<span class="dv">10</span>],</a> -<a class="sourceLine" id="cb539-16" data-line-number="16"> <span class="dt">midpoint =</span> <span class="fl">0.5</span>, <span class="dt">limit =</span> <span class="kw">c</span>(<span class="dv">0</span>,<span class="dv">1</span>)) <span class="op">+</span></a> -<a class="sourceLine" id="cb539-17" data-line-number="17"><span class="st"> </span><span class="kw">scale_x_discrete</span>(<span class="dt">expand =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">0</span>)) <span class="op">+</span></a> -<a class="sourceLine" id="cb539-18" data-line-number="18"><span class="st"> </span><span class="kw">scale_y_discrete</span>(<span class="dt">expand =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">0</span>)) <span class="op">+</span><span class="st"> </span></a> -<a class="sourceLine" id="cb539-19" data-line-number="19"><span class="st"> </span><span class="kw">theme</span>(</a> -<a class="sourceLine" id="cb539-20" data-line-number="20"> <span class="dt">axis.text.x =</span> <span class="kw">element_text</span>(</a> -<a class="sourceLine" id="cb539-21" data-line-number="21"> <span class="dt">angle =</span> <span class="dv">45</span>, </a> -<a class="sourceLine" id="cb539-22" data-line-number="22"> <span class="dt">vjust =</span> <span class="dv">1</span>, </a> -<a class="sourceLine" id="cb539-23" data-line-number="23"> <span class="dt">size =</span> <span class="dv">12</span>, </a> -<a 
class="sourceLine" id="cb539-24" data-line-number="24"> <span class="dt">hjust =</span> <span class="dv">1</span></a> -<a class="sourceLine" id="cb539-25" data-line-number="25"> )</a> -<a class="sourceLine" id="cb539-26" data-line-number="26"> ) <span class="op">+</span><span class="st"> </span></a> -<a class="sourceLine" id="cb539-27" data-line-number="27"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Effect of batch regression methods per individual"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb579"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb579-1" data-line-number="1">reads.qc.endog =<span class="st"> </span>reads.qc[endog_genes,]</a> +<a class="sourceLine" id="cb579-2" data-line-number="2">reads.qc.endog =<span class="st"> </span><span class="kw">runPCA</span>(reads.qc.endog, <span class="dt">exprs_values =</span> <span class="st">'logcounts'</span>, <span class="dt">ncomponents =</span> <span class="dv">20</span>)</a> +<a class="sourceLine" id="cb579-3" data-line-number="3">pca <-<span class="st"> </span><span class="kw">as.matrix</span>(reads.qc.endog<span class="op">@</span>reducedDims<span class="op">@</span>listData[[<span class="st">"PCA"</span>]])</a> +<a class="sourceLine" id="cb579-4" data-line-number="4">harmony_emb <-<span class="st"> </span><span class="kw">HarmonyMatrix</span>(pca, reads.qc.endog<span class="op">$</span>batch, <span class="dt">theta=</span><span class="dv">2</span>, <span class="dt">do_pca=</span><span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb579-5" data-line-number="5">reads.qc.endog<span class="op">@</span>reducedDims<span class="op">@</span>listData[[<span class="st">'harmony'</span>]] <-<span class="st"> </span>harmony_emb</a> +<a class="sourceLine" id="cb579-6" data-line-number="6"></a> +<a class="sourceLine" id="cb579-7" data-line-number="7"><span class="kw">plotReducedDim</span>(</a> +<a class="sourceLine" id="cb579-8" 
data-line-number="8"> reads.qc.endog,</a> +<a class="sourceLine" id="cb579-9" data-line-number="9"> <span class="dt">use_dimred =</span> <span class="st">'harmony'</span>,</a> +<a class="sourceLine" id="cb579-10" data-line-number="10"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb579-11" data-line-number="11"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb579-12" data-line-number="12"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb579-13" data-line-number="13">)</a></code></pre></div> +<div class="sourceCode" id="cb580"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb580-1" data-line-number="1"><span class="cf">for</span>(n <span class="cf">in</span> <span class="kw">assayNames</span>(reads.qc)) {</a> +<a class="sourceLine" id="cb580-2" data-line-number="2"> tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> +<a class="sourceLine" id="cb580-3" data-line-number="3"> reads.qc[endog_genes, ],</a> +<a class="sourceLine" id="cb580-4" data-line-number="4"> <span class="dt">exprs_values =</span> n</a> +<a class="sourceLine" id="cb580-5" data-line-number="5"> )</a> +<a class="sourceLine" id="cb580-6" data-line-number="6"> <span class="kw">print</span>(</a> +<a class="sourceLine" id="cb580-7" data-line-number="7"> <span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb580-8" data-line-number="8"> tmp,</a> +<a class="sourceLine" id="cb580-9" data-line-number="9"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb580-10" data-line-number="10"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb580-11" data-line-number="11"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a 
class="sourceLine" id="cb580-12" data-line-number="12"> ) <span class="op">+</span></a> +<a class="sourceLine" id="cb580-13" data-line-number="13"><span class="st"> </span><span class="kw">ggtitle</span>(n)</a> +<a class="sourceLine" id="cb580-14" data-line-number="14"> )</a> +<a class="sourceLine" id="cb580-15" data-line-number="15">}</a></code></pre></div> +<div class="sourceCode" id="cb581"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb581-1" data-line-number="1">res <-<span class="st"> </span><span class="kw">list</span>()</a> +<a class="sourceLine" id="cb581-2" data-line-number="2"><span class="cf">for</span>(n <span class="cf">in</span> <span class="kw">assayNames</span>(reads.qc)) {</a> +<a class="sourceLine" id="cb581-3" data-line-number="3"> res[[n]] <-<span class="st"> </span><span class="kw">suppressWarnings</span>(<span class="kw">calc_cell_RLE</span>(<span class="kw">assay</span>(reads.qc, n), erccs))</a> +<a class="sourceLine" id="cb581-4" data-line-number="4">}</a> +<a class="sourceLine" id="cb581-5" data-line-number="5"><span class="kw">par</span>(<span class="dt">mar=</span><span class="kw">c</span>(<span class="dv">6</span>,<span class="dv">4</span>,<span class="dv">1</span>,<span class="dv">1</span>))</a> +<a class="sourceLine" id="cb581-6" data-line-number="6"><span class="kw">boxplot</span>(res, <span class="dt">las=</span><span class="dv">2</span>)</a></code></pre></div> +<div class="sourceCode" id="cb582"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb582-1" data-line-number="1">compare_kBET_results <-<span class="st"> </span><span class="cf">function</span>(sce){</a> +<a class="sourceLine" id="cb582-2" data-line-number="2"> indiv <-<span class="st"> </span><span class="kw">unique</span>(sce<span class="op">$</span>individual)</a> +<a class="sourceLine" id="cb582-3" data-line-number="3"> norms <-<span class="st"> </span><span class="kw">assayNames</span>(sce) <span 
class="co"># Get all normalizations</span></a> +<a class="sourceLine" id="cb582-4" data-line-number="4"> results <-<span class="st"> </span><span class="kw">list</span>()</a> +<a class="sourceLine" id="cb582-5" data-line-number="5"> <span class="cf">for</span> (i <span class="cf">in</span> indiv){ </a> +<a class="sourceLine" id="cb582-6" data-line-number="6"> <span class="cf">for</span> (j <span class="cf">in</span> norms){</a> +<a class="sourceLine" id="cb582-7" data-line-number="7"> tmp <-<span class="st"> </span><span class="kw">kBET</span>(</a> +<a class="sourceLine" id="cb582-8" data-line-number="8"> <span class="dt">df =</span> <span class="kw">t</span>(<span class="kw">assay</span>(sce[,sce<span class="op">$</span>individual<span class="op">==</span><span class="st"> </span>i], j)), </a> +<a class="sourceLine" id="cb582-9" data-line-number="9"> <span class="dt">batch =</span> sce<span class="op">$</span>batch[sce<span class="op">$</span>individual<span class="op">==</span>i], </a> +<a class="sourceLine" id="cb582-10" data-line-number="10"> <span class="dt">heuristic =</span> <span class="ot">TRUE</span>, </a> +<a class="sourceLine" id="cb582-11" data-line-number="11"> <span class="dt">verbose =</span> <span class="ot">FALSE</span>, </a> +<a class="sourceLine" id="cb582-12" data-line-number="12"> <span class="dt">addTest =</span> <span class="ot">FALSE</span>, </a> +<a class="sourceLine" id="cb582-13" data-line-number="13"> <span class="dt">plot =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb582-14" data-line-number="14"> results[[i]][[j]] <-<span class="st"> </span>tmp<span class="op">$</span>summary<span class="op">$</span>kBET.observed[<span class="dv">1</span>]</a> +<a class="sourceLine" id="cb582-15" data-line-number="15"> }</a> +<a class="sourceLine" id="cb582-16" data-line-number="16"> }</a> +<a class="sourceLine" id="cb582-17" data-line-number="17"> <span class="kw">return</span>(<span 
class="kw">as.data.frame</span>(results))</a> +<a class="sourceLine" id="cb582-18" data-line-number="18">}</a> +<a class="sourceLine" id="cb582-19" data-line-number="19"></a> +<a class="sourceLine" id="cb582-20" data-line-number="20">eff_debatching <-<span class="st"> </span><span class="kw">compare_kBET_results</span>(reads.qc)</a></code></pre></div> +<div class="sourceCode" id="cb583"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb583-1" data-line-number="1"><span class="kw">require</span>(<span class="st">"reshape2"</span>)</a> +<a class="sourceLine" id="cb583-2" data-line-number="2"><span class="kw">require</span>(<span class="st">"RColorBrewer"</span>)</a> +<a class="sourceLine" id="cb583-3" data-line-number="3"><span class="co"># Plot results</span></a> +<a class="sourceLine" id="cb583-4" data-line-number="4">dod <-<span class="st"> </span><span class="kw">melt</span>(<span class="kw">as.matrix</span>(eff_debatching), <span class="dt">value.name =</span> <span class="st">"kBET"</span>)</a> +<a class="sourceLine" id="cb583-5" data-line-number="5"><span class="kw">colnames</span>(dod)[<span class="dv">1</span><span class="op">:</span><span class="dv">2</span>] <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Normalisation"</span>, <span class="st">"Individual"</span>)</a> +<a class="sourceLine" id="cb583-6" data-line-number="6"></a> +<a class="sourceLine" id="cb583-7" data-line-number="7">colorset <-<span class="st"> </span><span class="kw">c</span>(<span class="st">'gray'</span>, <span class="kw">brewer.pal</span>(<span class="dt">n =</span> <span class="dv">9</span>, <span class="st">"RdYlBu"</span>))</a> +<a class="sourceLine" id="cb583-8" data-line-number="8"></a> +<a class="sourceLine" id="cb583-9" data-line-number="9"><span class="kw">ggplot</span>(dod, <span class="kw">aes</span>(Normalisation, Individual, <span class="dt">fill=</span>kBET)) <span class="op">+</span><span class="st"> </span></a> +<a 
class="sourceLine" id="cb583-10" data-line-number="10"><span class="st"> </span><span class="kw">geom_tile</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb583-11" data-line-number="11"><span class="st"> </span><span class="kw">scale_fill_gradient2</span>(</a> +<a class="sourceLine" id="cb583-12" data-line-number="12"> <span class="dt">na.value =</span> <span class="st">"gray"</span>,</a> +<a class="sourceLine" id="cb583-13" data-line-number="13"> <span class="dt">low =</span> colorset[<span class="dv">2</span>],</a> +<a class="sourceLine" id="cb583-14" data-line-number="14"> <span class="dt">mid=</span>colorset[<span class="dv">6</span>],</a> +<a class="sourceLine" id="cb583-15" data-line-number="15"> <span class="dt">high =</span> colorset[<span class="dv">10</span>],</a> +<a class="sourceLine" id="cb583-16" data-line-number="16"> <span class="dt">midpoint =</span> <span class="fl">0.5</span>, <span class="dt">limit =</span> <span class="kw">c</span>(<span class="dv">0</span>,<span class="dv">1</span>)) <span class="op">+</span></a> +<a class="sourceLine" id="cb583-17" data-line-number="17"><span class="st"> </span><span class="kw">scale_x_discrete</span>(<span class="dt">expand =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">0</span>)) <span class="op">+</span></a> +<a class="sourceLine" id="cb583-18" data-line-number="18"><span class="st"> </span><span class="kw">scale_y_discrete</span>(<span class="dt">expand =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">0</span>)) <span class="op">+</span><span class="st"> </span></a> +<a class="sourceLine" id="cb583-19" data-line-number="19"><span class="st"> </span><span class="kw">theme</span>(</a> +<a class="sourceLine" id="cb583-20" data-line-number="20"> <span class="dt">axis.text.x =</span> <span class="kw">element_text</span>(</a> +<a class="sourceLine" id="cb583-21" data-line-number="21"> <span class="dt">angle =</span> <span 
class="dv">45</span>, </a> +<a class="sourceLine" id="cb583-22" data-line-number="22"> <span class="dt">vjust =</span> <span class="dv">1</span>, </a> +<a class="sourceLine" id="cb583-23" data-line-number="23"> <span class="dt">size =</span> <span class="dv">12</span>, </a> +<a class="sourceLine" id="cb583-24" data-line-number="24"> <span class="dt">hjust =</span> <span class="dv">1</span></a> +<a class="sourceLine" id="cb583-25" data-line-number="25"> )</a> +<a class="sourceLine" id="cb583-26" data-line-number="26"> ) <span class="op">+</span><span class="st"> </span></a> +<a class="sourceLine" id="cb583-27" data-line-number="27"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Effect of batch regression methods per individual"</span>)</a></code></pre></div> </div> <div id="feature-selection" class="section level2"> <h2><span class="header-section-number">7.8</span> Feature Selection</h2> -<div class="sourceCode" id="cb540"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb540-1" data-line-number="1"><span class="kw">library</span>(scRNA.seq.funcs)</a> -<a class="sourceLine" id="cb540-2" data-line-number="2"><span class="kw">library</span>(matrixStats)</a> -<a class="sourceLine" id="cb540-3" data-line-number="3"><span class="kw">library</span>(M3Drop)</a> -<a class="sourceLine" id="cb540-4" data-line-number="4"><span class="kw">library</span>(RColorBrewer)</a> -<a class="sourceLine" id="cb540-5" data-line-number="5"><span class="kw">library</span>(SingleCellExperiment)</a> -<a class="sourceLine" id="cb540-6" data-line-number="6"><span class="kw">set.seed</span>(<span class="dv">1</span>)</a></code></pre></div> +<div class="sourceCode" id="cb584"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb584-1" data-line-number="1"><span class="kw">library</span>(scRNA.seq.funcs)</a> +<a class="sourceLine" id="cb584-2" data-line-number="2"><span class="kw">library</span>(matrixStats)</a> 
+<a class="sourceLine" id="cb584-3" data-line-number="3"><span class="kw">library</span>(M3Drop)</a> +<a class="sourceLine" id="cb584-4" data-line-number="4"><span class="kw">library</span>(RColorBrewer)</a> +<a class="sourceLine" id="cb584-5" data-line-number="5"><span class="kw">library</span>(SingleCellExperiment)</a> +<a class="sourceLine" id="cb584-6" data-line-number="6"><span class="kw">library</span>(Polychrome)</a> +<a class="sourceLine" id="cb584-7" data-line-number="7"><span class="kw">library</span>(scater)</a> +<a class="sourceLine" id="cb584-8" data-line-number="8"><span class="kw">library</span>(scran)</a> +<a class="sourceLine" id="cb584-9" data-line-number="9"><span class="kw">set.seed</span>(<span class="dv">1</span>)</a></code></pre></div> <p>Single-cell RNASeq is capable of measuring the expression of many thousands of genes in every cell. However, in most situations only a portion of those will show a response to the biological condition of @@ -5894,9 +6932,10 @@ can be considered a form of supervised feature selection since it uses the known biological label of each sample to identify features (i.e. 
genes) which are expressed at different levels across groups.</p> <p>For this section we will continue working with the Deng data.</p> -<div class="sourceCode" id="cb541"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb541-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/deng/deng-reads.rds"</span>)</a> -<a class="sourceLine" id="cb541-2" data-line-number="2">celltype_labs <-<span class="st"> </span><span class="kw">colData</span>(deng)<span class="op">$</span>cell_type2</a> -<a class="sourceLine" id="cb541-3" data-line-number="3">cell_colors <-<span class="st"> </span><span class="kw">brewer.pal</span>(<span class="kw">max</span>(<span class="dv">3</span>,<span class="kw">length</span>(<span class="kw">unique</span>(celltype_labs))), <span class="st">"Set3"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb585"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb585-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/deng/deng-reads.rds"</span>)</a> +<a class="sourceLine" id="cb585-2" data-line-number="2">celltype_labs <-<span class="st"> </span><span class="kw">colData</span>(deng)<span class="op">$</span>cell_type2</a> +<a class="sourceLine" id="cb585-3" data-line-number="3">cell_colors <-<span class="st"> </span><span class="kw">createPalette</span>(<span class="dv">10</span>, <span class="kw">c</span>(<span class="st">"#010101"</span>, <span class="st">"#ff0000"</span>), <span class="dt">M=</span><span class="dv">1000</span>)</a> +<a class="sourceLine" id="cb585-4" data-line-number="4"><span class="kw">names</span>(cell_colors) <-<span class="st"> </span><span class="kw">unique</span>(<span class="kw">as.character</span>(celltype_labs))</a></code></pre></div> <p>Feature selection is performed after QC, however this data has already been QCed so we can skip that step 
here. M3Drop contains two different feature selection methods “M3DropFeatureSelection” which is based on a Michaelis-Menten curve and is designed @@ -5906,12 +6945,13 @@ is designed for UMI count data. We will demonstrate both on the Deng Smartseq2 data.</p> <p>M3Drop feature selection runs directly on a normalized (but not log-transformed) expression matrix. This can be extracted from our SingleCellExperiment object using the command below.</p> -<div class="sourceCode" id="cb542"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb542-1" data-line-number="1">expr_matrix <-<span class="st"> </span>M3Drop<span class="op">::</span><span class="kw">M3DropConvertData</span>(deng)</a></code></pre></div> +<div class="sourceCode" id="cb586"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb586-1" data-line-number="1">expr_matrix <-<span class="st"> </span>M3Drop<span class="op">::</span><span class="kw">M3DropConvertData</span>(deng)</a></code></pre></div> +<pre><code>## [1] "Removing 1134 undetected genes."</code></pre> <p>This function is compatible with most single-cell RNA-seq analysis packages including: scater, SingleCellExperiment, monocle, and Seurat. It can also convert an existing expression matrix to the correct form (removing undetected genes & normalizing/delogging) if you specify whether the matrix is raw counts, or log transformed.
Check the manual for details:</p> -<div class="sourceCode" id="cb543"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb543-1" data-line-number="1">?M3Drop<span class="op">::</span>M3DropConvertData</a></code></pre></div> +<div class="sourceCode" id="cb588"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb588-1" data-line-number="1">?M3Drop<span class="op">::</span>M3DropConvertData</a></code></pre></div> <p><strong>Exercise 1</strong>: Confirm that the conversion function has removed undetected genes:</p> <div id="identifying-genes-vs-a-null-model" class="section level3"> <h3><span class="header-section-number">7.8.1</span> Identifying Genes vs a Null Model</h3> @@ -5919,12 +6959,13 @@ if you specify whether the matrix is raw counts, or log transformed. Check the m first is to identify genes which behave differently from a null model describing just the technical noise expected in the dataset.</p> <p>If the dataset contains spike-in RNAs they can be used to directly model -technical noise. However, measurements of spike-ins may not experience -the same technical noise as endogenous transcripts <a href="https://www.nature.com/nmeth/journal/v14/n4/full/nmeth.4220.html">(Svensson et al., 2017)</a>. -In addition, scRNASeq experiments often contain only a small number of -spike-ins which reduces our confidence in fitted model parameters.</p> -<div id="highly-variable-genes" class="section level4"> -<h4><span class="header-section-number">7.8.1.1</span> Highly Variable Genes</h4> +technical noise. However, measurements of spike-ins may not experience the same +technical noise as endogenous transcripts <a href="https://www.nature.com/nmeth/journal/v14/n4/full/nmeth.4220.html">(Svensson et al., +2017)</a>. 
In +addition, scRNASeq experiments often contain only a small number of spike-ins +which reduces our confidence in fitted model parameters.</p> +<div id="highly-variable-genes---brennecke-method" class="section level4"> +<h4><span class="header-section-number">7.8.1.1</span> Highly Variable Genes - Brennecke method</h4> <p>The first method proposed to identify features in scRNASeq datasets was to identify highly variable genes (HVG). HVG assumes that if genes have large differences in expression across cells some of those differences @@ -5935,86 +6976,157 @@ cells. This relationship must be corrected for to properly identify HVGs.</p> <p><strong>Exercise 2</strong> Using the functions rowMeans and rowVars to plot the relationship between mean expression and variance for all genes in this dataset. (Hint: use log=“xy†to plot on a log-scale).</p> -<p>A popular method to correct for the relationship between variance and mean expression -was proposed by <a href="http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2645.html">Brennecke et al.</a>. -To use the Brennecke method, we first normalize for library size then calculate -the mean and the square coefficient of variation (variation divided by -the squared mean expression). A quadratic curve is fit to the relationship -between these two variables for the ERCC spike-in, and then a chi-square test is used to find genes -significantly above the curve. This method is included in the M3Drop package as the -Brennecke_getVariableGenes(counts, spikes) function. However, this dataset does not contain spike-ins -so we will use the entire dataset to estimate the technical noise.</p> -<p>In the figure below the red curve -is the fitted technical noise model and the dashed line is the 95% -CI. 
Pink dots are the genes with significant biological variability -after multiple-testing correction.</p> -<div class="sourceCode" id="cb544"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb544-1" data-line-number="1">Brennecke_HVG <-<span class="st"> </span><span class="kw">BrenneckeGetVariableGenes</span>(</a> -<a class="sourceLine" id="cb544-2" data-line-number="2"> expr_matrix,</a> -<a class="sourceLine" id="cb544-3" data-line-number="3"> <span class="dt">fdr =</span> <span class="fl">0.01</span>,</a> -<a class="sourceLine" id="cb544-4" data-line-number="4"> <span class="dt">minBiolDisp =</span> <span class="fl">0.5</span></a> -<a class="sourceLine" id="cb544-5" data-line-number="5">)</a></code></pre></div> -<p>This function returns a matrix of significant genes as well as their estimated effect size (difference -between observed and expected coefficient of variation), and their significance as raw p.values and -FDR corrected q.values. For now we will just keep the names of the significant HVG genes.</p> -<div class="sourceCode" id="cb545"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb545-1" data-line-number="1">HVG_genes <-<span class="st"> </span>Brennecke_HVG<span class="op">$</span>Gene</a></code></pre></div> +<p><img src="feature-selection_files/figure-html/unnamed-chunk-6-1.png" width="90%" style="display: block; margin: auto;" /></p> +<p>An early method to correct for the relationship between variance and mean +expression was proposed by <a href="http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2645.html">Brennecke et +al.</a>. To use +the Brennecke method, we first normalize for library size then calculate the +mean and the square coefficient of variation (variation divided by the squared +mean expression). A quadratic curve is fit to the relationship between these two +variables for the ERCC spike-in, and then a chi-square test is used to find +genes significantly above the curve. 
This method is included in the M3Drop +package as the Brennecke_getVariableGenes(counts, spikes) function. However, +this dataset does not contain spike-ins so we will use the entire dataset to +estimate the technical noise.</p> +<p>In the figure below the red curve is the fitted technical noise model and the +dashed line is the 95% CI. Pink dots are the genes with significant biological +variability after multiple-testing correction.</p> +<div class="sourceCode" id="cb589"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb589-1" data-line-number="1">Brennecke_HVG <-<span class="st"> </span><span class="kw">BrenneckeGetVariableGenes</span>(</a> +<a class="sourceLine" id="cb589-2" data-line-number="2"> expr_matrix,</a> +<a class="sourceLine" id="cb589-3" data-line-number="3"> <span class="dt">fdr =</span> <span class="fl">0.01</span>,</a> +<a class="sourceLine" id="cb589-4" data-line-number="4"> <span class="dt">minBiolDisp =</span> <span class="fl">0.5</span></a> +<a class="sourceLine" id="cb589-5" data-line-number="5">)</a></code></pre></div> +<p><img src="feature-selection_files/figure-html/unnamed-chunk-7-1.png" width="90%" style="display: block; margin: auto;" /></p> +<p>This function returns a matrix of significant genes as well as their estimated +effect size (difference between observed and expected coefficient of variation), +and their significance as raw p.values and FDR corrected q.values. 
For now we +will just keep the names of the significant HVG genes.</p> +<div class="sourceCode" id="cb590"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb590-1" data-line-number="1">HVG_genes <-<span class="st"> </span>Brennecke_HVG<span class="op">$</span>Gene</a></code></pre></div> <p><strong>Exercise 3</strong> How many genes were significant using BrenneckeGetVariableGenes?</p> +<pre><code>## [1] 1303</code></pre> +</div> +<div id="highly-variable-genes---simplesinglecell-method" class="section level4"> +<h4><span class="header-section-number">7.8.1.2</span> Highly Variable Genes - simpleSingleCell method</h4> +<p>The Bioconductor +<a href="https://bioconductor.org/packages/release/workflows/html/simpleSingleCell.html">simpleSingleCell</a> +workflow has a great deal of excellent material to help your analyses. Here, we +show how to identify highly variable genes using functionality from the <code>scran</code> +package.</p> +<p>This method assumes that technical variance is captured by a Poisson +distribution, and that variance beyond that explained by a Poisson distribution +represents biological variance of interest. This approach separates the +biological component of the variance from the technical component and thus can +rank genes based on their “biological” variance.
This model also provides +p-values (with FDR adjustment) that can be used to identify the set of +“significant” highly variable genes at a given significance level.</p> +<div class="sourceCode" id="cb592"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb592-1" data-line-number="1"><span class="co">### make a technical trend of variance based on Poisson</span></a> +<a class="sourceLine" id="cb592-2" data-line-number="2">var.fit <-<span class="st"> </span><span class="kw">trendVar</span>(deng, <span class="dt">parametric=</span><span class="ot">TRUE</span>, <span class="dt">loess.args=</span><span class="kw">list</span>(<span class="dt">span=</span><span class="fl">0.4</span>), <span class="dt">use.spikes =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb592-3" data-line-number="3">var.out <-<span class="st"> </span><span class="kw">decomposeVar</span>(deng, var.fit)</a> +<a class="sourceLine" id="cb592-4" data-line-number="4"><span class="kw">plot</span>(var.out<span class="op">$</span>mean, var.out<span class="op">$</span>total, <span class="dt">pch=</span><span class="dv">16</span>, <span class="dt">cex=</span><span class="fl">0.6</span>, <span class="dt">xlab=</span><span class="st">"Mean log-expression"</span>, </a> +<a class="sourceLine" id="cb592-5" data-line-number="5"> <span class="dt">ylab=</span><span class="st">"Variance of log-expression"</span>)</a> +<a class="sourceLine" id="cb592-6" data-line-number="6"><span class="kw">points</span>(var.out<span class="op">$</span>mean[<span class="kw">isSpike</span>(deng)], var.out<span class="op">$</span>total[<span class="kw">isSpike</span>(deng)], <span class="dt">col=</span><span class="st">"red"</span>, <span class="dt">pch=</span><span class="dv">16</span>)</a> +<a class="sourceLine" id="cb592-7" data-line-number="7"><span class="kw">curve</span>(var.fit<span class="op">$</span><span class="kw">trend</span>(x), <span class="dt">col=</span><span
class="st">"dodgerblue"</span>, <span class="dt">add=</span><span class="ot">TRUE</span>, <span class="dt">lwd=</span><span class="dv">2</span>)</a></code></pre></div> +<p><img src="feature-selection_files/figure-html/hvg-simpleSingleCell-1.png" width="90%" style="display: block; margin: auto;" /></p> +<div class="sourceCode" id="cb593"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb593-1" data-line-number="1">chosen.genes <-<span class="st"> </span><span class="kw">order</span>(var.out<span class="op">$</span>bio, <span class="dt">decreasing=</span><span class="ot">TRUE</span>)[<span class="dv">1</span><span class="op">:</span><span class="dv">10</span>]</a> +<a class="sourceLine" id="cb593-2" data-line-number="2"><span class="kw">plotExpression</span>(deng, <span class="kw">rownames</span>(var.out)[chosen.genes], </a> +<a class="sourceLine" id="cb593-3" data-line-number="3"> <span class="dt">point_alpha=</span><span class="fl">0.5</span>, <span class="dt">jitter_type=</span><span class="st">"jitter"</span>)</a></code></pre></div> +<p><img src="feature-selection_files/figure-html/hvg-simpleSingleCell-2.png" width="90%" style="display: block; margin: auto;" /></p> +<div class="sourceCode" id="cb594"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb594-1" data-line-number="1">top.dec <-<span class="st"> </span>var.out[<span class="kw">order</span>(var.out<span class="op">$</span>bio, <span class="dt">decreasing=</span><span class="ot">TRUE</span>),]</a> +<a class="sourceLine" id="cb594-2" data-line-number="2"> <span class="co"># the highly variable genes with largest biological components</span></a> +<a class="sourceLine" id="cb594-3" data-line-number="3"><span class="kw">head</span>(top.dec)</a></code></pre></div> +<pre><code>## DataFrame with 6 rows and 6 columns +## mean total bio +## <numeric> <numeric> <numeric> +## Obox6 7.0852220910669 39.7469062194493 27.7222625676479 +## BC053393 
6.23846872763624 36.7868129334449 22.7409221497424 +## Krt18 8.06957111931139 30.7163256353151 21.3338604240051 +## Upp1 6.70443458808406 32.9196031154138 19.9537242012223 +## Akr1b8 9.31035205790714 25.9351262454146 19.563014227718 +## Spp1 5.52672835522051 34.8140952020968 19.5492807120572 +## tech p.value FDR +## <numeric> <numeric> <numeric> +## Obox6 12.0246436518013 6.67046481158613e-67 4.98750653962295e-64 +## BC053393 14.0458907837025 1.89687518927716e-40 5.90955657926056e-38 +## Krt18 9.38246521130992 1.28064383710762e-65 9.26649093876163e-63 +## Upp1 12.9658789141915 1.39045180596497e-37 3.89865305745004e-35 +## Akr1b8 6.37211201769662 2.70679041028919e-99 5.51963779029062e-96 +## Spp1 15.2648144900397 9.4641203490752e-29 1.76908069625088e-26</code></pre> +<div class="sourceCode" id="cb596"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb596-1" data-line-number="1">simplesinglecell_genes <-<span class="st"> </span><span class="kw">rownames</span>(top.dec)[top.dec<span class="op">$</span>FDR <span class="op"><</span><span class="st"> </span><span class="fl">0.001</span>]</a> +<a class="sourceLine" id="cb596-2" data-line-number="2"><span class="kw">table</span>(top.dec<span class="op">$</span>FDR <span class="op"><</span><span class="st"> </span><span class="fl">0.001</span>)</a></code></pre></div> +<pre><code>## +## FALSE TRUE +## 21124 1307</code></pre> +<p>If we set an FDR threshold of 0.1%, this approach identifies around 1300 highly +variable genes.</p> +<p>The output of this variance modelling can be used as input to a <code>denoisePCA()</code> +function to compute “denoised†principal components for clustering and other +downstream analyses (details not shown here; please see the <code>simpleSingleCell</code> +workflow).</p> </div> <div id="high-dropout-genes" class="section level4"> -<h4><span class="header-section-number">7.8.1.2</span> High Dropout Genes</h4> -<p>An alternative to finding HVGs is to identify genes 
with unexpectedly high numbers of zeros. -The frequency of zeros, known as the “dropout rateâ€, is very closely related to expression level -in scRNASeq data. Zeros are the dominant feature of single-cell RNASeq data, typically accounting -for over half of the entries in the final expression matrix. These zeros predominantly result -from the failure of mRNAs failing to be reversed transcribed <a href="http://www.biorxiv.org/content/early/2017/05/25/065094">(Andrews and Hemberg, 2016)</a>. Reverse transcription -is an enzyme reaction thus can be modelled using the Michaelis-Menten equation:</p> +<h4><span class="header-section-number">7.8.1.3</span> High Dropout Genes</h4> +<p>An alternative to finding HVGs is to identify genes with unexpectedly high +numbers of zeros. The frequency of zeros, known as the “dropout rateâ€, is very +closely related to expression level in scRNASeq data. Zeros are the dominant +feature of single-cell RNASeq data, typically accounting for over half of the +entries in the final expression matrix. These zeros predominantly result from +the failure of mRNAs failing to be reversed transcribed <a href="http://www.biorxiv.org/content/early/2017/05/25/065094">(Andrews and Hemberg, +2016)</a>. 
Reverse +transcription is an enzyme reaction thus can be modelled using the +Michaelis-Menten equation:</p> <p><span class="math display">\[P_{dropout} = 1 - S/(K + S)\]</span></p> -<p>where <span class="math inline">\(S\)</span> is the mRNA concentration in the cell (we will estimate this as average expression) -and <span class="math inline">\(K\)</span> is the Michaelis-Menten constant.</p> -<p>Because the Michaelis-Menten equation is a convex non-linear function, genes which are -differentially expression across two or more populations of cells in our dataset will -be shifted up/right of the Michaelis-Menten model (see Figure below).</p> -<div class="sourceCode" id="cb546"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb546-1" data-line-number="1">K <-<span class="st"> </span><span class="dv">49</span></a> -<a class="sourceLine" id="cb546-2" data-line-number="2">S_sim <-<span class="st"> </span><span class="dv">10</span><span class="op">^</span><span class="kw">seq</span>(<span class="dt">from =</span> <span class="dv">-3</span>, <span class="dt">to =</span> <span class="dv">4</span>, <span class="dt">by =</span> <span class="fl">0.05</span>) <span class="co"># range of expression values</span></a> -<a class="sourceLine" id="cb546-3" data-line-number="3">MM <-<span class="st"> </span><span class="dv">1</span> <span class="op">-</span><span class="st"> </span>S_sim <span class="op">/</span><span class="st"> </span>(K <span class="op">+</span><span class="st"> </span>S_sim)</a> -<a class="sourceLine" id="cb546-4" data-line-number="4"><span class="kw">plot</span>(</a> -<a class="sourceLine" id="cb546-5" data-line-number="5"> S_sim, </a> -<a class="sourceLine" id="cb546-6" data-line-number="6"> MM, </a> -<a class="sourceLine" id="cb546-7" data-line-number="7"> <span class="dt">type =</span> <span class="st">"l"</span>, </a> -<a class="sourceLine" id="cb546-8" data-line-number="8"> <span class="dt">lwd =</span> <span 
class="dv">3</span>, </a> -<a class="sourceLine" id="cb546-9" data-line-number="9"> <span class="dt">xlab =</span> <span class="st">"Expression"</span>, </a> -<a class="sourceLine" id="cb546-10" data-line-number="10"> <span class="dt">ylab =</span> <span class="st">"Dropout Rate"</span>, </a> -<a class="sourceLine" id="cb546-11" data-line-number="11"> <span class="dt">xlim =</span> <span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">1000</span>)</a> -<a class="sourceLine" id="cb546-12" data-line-number="12">)</a> -<a class="sourceLine" id="cb546-13" data-line-number="13">S1 <-<span class="st"> </span><span class="dv">10</span> <span class="co"># Mean expression in population 1</span></a> -<a class="sourceLine" id="cb546-14" data-line-number="14">P1 <-<span class="st"> </span><span class="dv">1</span> <span class="op">-</span><span class="st"> </span>S1 <span class="op">/</span><span class="st"> </span>(K <span class="op">+</span><span class="st"> </span>S1) <span class="co"># Dropouts for cells in condition 1</span></a> -<a class="sourceLine" id="cb546-15" data-line-number="15">S2 <-<span class="st"> </span><span class="dv">750</span> <span class="co"># Mean expression in population 2</span></a> -<a class="sourceLine" id="cb546-16" data-line-number="16">P2 <-<span class="st"> </span><span class="dv">1</span> <span class="op">-</span><span class="st"> </span>S2 <span class="op">/</span><span class="st"> </span>(K <span class="op">+</span><span class="st"> </span>S2) <span class="co"># Dropouts for cells in condition 2</span></a> -<a class="sourceLine" id="cb546-17" data-line-number="17"><span class="kw">points</span>(</a> -<a class="sourceLine" id="cb546-18" data-line-number="18"> <span class="kw">c</span>(S1, S2),</a> -<a class="sourceLine" id="cb546-19" data-line-number="19"> <span class="kw">c</span>(P1, P2), </a> -<a class="sourceLine" id="cb546-20" data-line-number="20"> <span class="dt">pch =</span> <span class="dv">16</span>, </a> -<a 
class="sourceLine" id="cb546-21" data-line-number="21"> <span class="dt">col =</span> <span class="st">"grey85"</span>, </a> -<a class="sourceLine" id="cb546-22" data-line-number="22"> <span class="dt">cex =</span> <span class="dv">3</span></a> -<a class="sourceLine" id="cb546-23" data-line-number="23">)</a> -<a class="sourceLine" id="cb546-24" data-line-number="24">mix <-<span class="st"> </span><span class="fl">0.5</span> <span class="co"># proportion of cells in condition 1</span></a> -<a class="sourceLine" id="cb546-25" data-line-number="25"><span class="kw">points</span>(</a> -<a class="sourceLine" id="cb546-26" data-line-number="26"> S1 <span class="op">*</span><span class="st"> </span>mix <span class="op">+</span><span class="st"> </span>S2 <span class="op">*</span><span class="st"> </span>(<span class="dv">1</span> <span class="op">-</span><span class="st"> </span>mix), </a> -<a class="sourceLine" id="cb546-27" data-line-number="27"> P1 <span class="op">*</span><span class="st"> </span>mix <span class="op">+</span><span class="st"> </span>P2 <span class="op">*</span><span class="st"> </span>(<span class="dv">1</span> <span class="op">-</span><span class="st"> </span>mix), </a> -<a class="sourceLine" id="cb546-28" data-line-number="28"> <span class="dt">pch =</span> <span class="dv">16</span>, </a> -<a class="sourceLine" id="cb546-29" data-line-number="29"> <span class="dt">col =</span> <span class="st">"grey35"</span>, </a> -<a class="sourceLine" id="cb546-30" data-line-number="30"> <span class="dt">cex =</span> <span class="dv">3</span></a> -<a class="sourceLine" id="cb546-31" data-line-number="31">)</a></code></pre></div> -<p><strong>Note</strong>: add <code>log="x"</code> to the <code>plot</code> call above to see how this looks on the log scale, which is used in M3Drop figures.</p> -<p><strong>Exercise 4</strong>: Produce the same plot as above with different expression levels (S1 & S2) and/or mixtures (mix).</p> +<p>where <span class="math 
inline">\(S\)</span> is the mRNA concentration in the cell (we will estimate this as +average expression) and <span class="math inline">\(K\)</span> is the Michaelis-Menten constant.</p> +<p>Because the Michaelis-Menten equation is a convex non-linear function, genes +which are differentially expression across two or more populations of cells in +our dataset will be shifted up/right of the Michaelis-Menten model (see Figure +below).</p> +<div class="sourceCode" id="cb598"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb598-1" data-line-number="1">K <-<span class="st"> </span><span class="dv">49</span></a> +<a class="sourceLine" id="cb598-2" data-line-number="2">S_sim <-<span class="st"> </span><span class="dv">10</span><span class="op">^</span><span class="kw">seq</span>(<span class="dt">from =</span> <span class="dv">-3</span>, <span class="dt">to =</span> <span class="dv">4</span>, <span class="dt">by =</span> <span class="fl">0.05</span>) <span class="co"># range of expression values</span></a> +<a class="sourceLine" id="cb598-3" data-line-number="3">MM <-<span class="st"> </span><span class="dv">1</span> <span class="op">-</span><span class="st"> </span>S_sim <span class="op">/</span><span class="st"> </span>(K <span class="op">+</span><span class="st"> </span>S_sim)</a> +<a class="sourceLine" id="cb598-4" data-line-number="4"><span class="kw">plot</span>(</a> +<a class="sourceLine" id="cb598-5" data-line-number="5"> S_sim, </a> +<a class="sourceLine" id="cb598-6" data-line-number="6"> MM, </a> +<a class="sourceLine" id="cb598-7" data-line-number="7"> <span class="dt">type =</span> <span class="st">"l"</span>, </a> +<a class="sourceLine" id="cb598-8" data-line-number="8"> <span class="dt">lwd =</span> <span class="dv">3</span>, </a> +<a class="sourceLine" id="cb598-9" data-line-number="9"> <span class="dt">xlab =</span> <span class="st">"Expression"</span>, </a> +<a class="sourceLine" id="cb598-10" data-line-number="10"> <span 
class="dt">ylab =</span> <span class="st">"Dropout Rate"</span>, </a> +<a class="sourceLine" id="cb598-11" data-line-number="11"> <span class="dt">xlim =</span> <span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">1000</span>)</a> +<a class="sourceLine" id="cb598-12" data-line-number="12">)</a> +<a class="sourceLine" id="cb598-13" data-line-number="13">S1 <-<span class="st"> </span><span class="dv">10</span> <span class="co"># Mean expression in population 1</span></a> +<a class="sourceLine" id="cb598-14" data-line-number="14">P1 <-<span class="st"> </span><span class="dv">1</span> <span class="op">-</span><span class="st"> </span>S1 <span class="op">/</span><span class="st"> </span>(K <span class="op">+</span><span class="st"> </span>S1) <span class="co"># Dropouts for cells in condition 1</span></a> +<a class="sourceLine" id="cb598-15" data-line-number="15">S2 <-<span class="st"> </span><span class="dv">750</span> <span class="co"># Mean expression in population 2</span></a> +<a class="sourceLine" id="cb598-16" data-line-number="16">P2 <-<span class="st"> </span><span class="dv">1</span> <span class="op">-</span><span class="st"> </span>S2 <span class="op">/</span><span class="st"> </span>(K <span class="op">+</span><span class="st"> </span>S2) <span class="co"># Dropouts for cells in condition 2</span></a> +<a class="sourceLine" id="cb598-17" data-line-number="17"><span class="kw">points</span>(</a> +<a class="sourceLine" id="cb598-18" data-line-number="18"> <span class="kw">c</span>(S1, S2),</a> +<a class="sourceLine" id="cb598-19" data-line-number="19"> <span class="kw">c</span>(P1, P2), </a> +<a class="sourceLine" id="cb598-20" data-line-number="20"> <span class="dt">pch =</span> <span class="dv">16</span>, </a> +<a class="sourceLine" id="cb598-21" data-line-number="21"> <span class="dt">col =</span> <span class="st">"grey85"</span>, </a> +<a class="sourceLine" id="cb598-22" data-line-number="22"> <span class="dt">cex =</span> <span 
class="dv">3</span></a> +<a class="sourceLine" id="cb598-23" data-line-number="23">)</a> +<a class="sourceLine" id="cb598-24" data-line-number="24">mix <-<span class="st"> </span><span class="fl">0.5</span> <span class="co"># proportion of cells in condition 1</span></a> +<a class="sourceLine" id="cb598-25" data-line-number="25"><span class="kw">points</span>(</a> +<a class="sourceLine" id="cb598-26" data-line-number="26"> S1 <span class="op">*</span><span class="st"> </span>mix <span class="op">+</span><span class="st"> </span>S2 <span class="op">*</span><span class="st"> </span>(<span class="dv">1</span> <span class="op">-</span><span class="st"> </span>mix), </a> +<a class="sourceLine" id="cb598-27" data-line-number="27"> P1 <span class="op">*</span><span class="st"> </span>mix <span class="op">+</span><span class="st"> </span>P2 <span class="op">*</span><span class="st"> </span>(<span class="dv">1</span> <span class="op">-</span><span class="st"> </span>mix), </a> +<a class="sourceLine" id="cb598-28" data-line-number="28"> <span class="dt">pch =</span> <span class="dv">16</span>, </a> +<a class="sourceLine" id="cb598-29" data-line-number="29"> <span class="dt">col =</span> <span class="st">"grey35"</span>, </a> +<a class="sourceLine" id="cb598-30" data-line-number="30"> <span class="dt">cex =</span> <span class="dv">3</span></a> +<a class="sourceLine" id="cb598-31" data-line-number="31">)</a></code></pre></div> +<p><img src="feature-selection_files/figure-html/unnamed-chunk-10-1.png" width="90%" style="display: block; margin: auto;" /></p> +<p><strong>Note</strong>: add <code>log="x"</code> to the <code>plot</code> call above to see how this looks on the +log scale, which is used in M3Drop figures.</p> +<p><strong>Exercise 4</strong>: Produce the same plot as above with different expression levels +(S1 & S2) and/or mixtures (mix).</p> <p>We use M3Drop to identify significant outliers to the right of the MM curve. 
We also apply 1% FDR multiple testing correction:</p> -<div class="sourceCode" id="cb547"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb547-1" data-line-number="1">M3Drop_genes <-<span class="st"> </span><span class="kw">M3DropFeatureSelection</span>(</a> -<a class="sourceLine" id="cb547-2" data-line-number="2"> expr_matrix,</a> -<a class="sourceLine" id="cb547-3" data-line-number="3"> <span class="dt">mt_method =</span> <span class="st">"fdr"</span>,</a> -<a class="sourceLine" id="cb547-4" data-line-number="4"> <span class="dt">mt_threshold =</span> <span class="fl">0.01</span></a> -<a class="sourceLine" id="cb547-5" data-line-number="5">)</a> -<a class="sourceLine" id="cb547-6" data-line-number="6">M3Drop_genes <-<span class="st"> </span>M3Drop_genes<span class="op">$</span>Gene</a></code></pre></div> +<div class="sourceCode" id="cb599"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb599-1" data-line-number="1">M3Drop_genes <-<span class="st"> </span><span class="kw">M3DropFeatureSelection</span>(</a> +<a class="sourceLine" id="cb599-2" data-line-number="2"> expr_matrix,</a> +<a class="sourceLine" id="cb599-3" data-line-number="3"> <span class="dt">mt_method =</span> <span class="st">"fdr"</span>,</a> +<a class="sourceLine" id="cb599-4" data-line-number="4"> <span class="dt">mt_threshold =</span> <span class="fl">0.01</span></a> +<a class="sourceLine" id="cb599-5" data-line-number="5">)</a></code></pre></div> +<p><img src="feature-selection_files/figure-html/unnamed-chunk-12-1.png" width="90%" style="display: block; margin: auto;" /></p> +<div class="sourceCode" id="cb600"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb600-1" data-line-number="1">M3Drop_genes <-<span class="st"> </span>M3Drop_genes<span class="op">$</span>Gene</a></code></pre></div> <p>An alternative method is contained in the M3Drop package that is tailored specifically for UMI-tagged data which 
generally contains many zeros resulting from low sequencing coverage in addition to those resulting from insufficient reverse-transcription. This model is the @@ -6032,70 +7144,427 @@ detection).</p> the Deng data is not UMI counts the model does not fit the noise sufficiently and far too many genes will be called as significant. Thus we will take the top 1500 by effect size.</p> -<div class="sourceCode" id="cb548"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb548-1" data-line-number="1">deng_int <-<span class="st"> </span><span class="kw">NBumiConvertData</span>(deng)</a> -<a class="sourceLine" id="cb548-2" data-line-number="2">DANB_fit <-<span class="st"> </span><span class="kw">NBumiFitModel</span>(deng_int) <span class="co"># DANB is fit to the raw count matrix</span></a> -<a class="sourceLine" id="cb548-3" data-line-number="3"><span class="co"># Perform DANB feature selection</span></a> -<a class="sourceLine" id="cb548-4" data-line-number="4">DropFS <-<span class="st"> </span><span class="kw">NBumiFeatureSelectionCombinedDrop</span>(DANB_fit, <span class="dt">method=</span><span class="st">"fdr"</span>, <span class="dt">qval.thresh=</span><span class="fl">0.01</span>, <span class="dt">suppress.plot=</span><span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb548-5" data-line-number="5">DANB_genes <-<span class="st"> </span>DropFS[<span class="dv">1</span><span class="op">:</span><span class="dv">1500</span>,]<span class="op">$</span>Gene</a></code></pre></div> +<div class="sourceCode" id="cb601"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb601-1" data-line-number="1">deng_int <-<span class="st"> </span><span class="kw">NBumiConvertData</span>(deng)</a></code></pre></div> +<pre><code>## [1] "Removing 1134 undetected genes."</code></pre> +<div class="sourceCode" id="cb603"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb603-1" 
data-line-number="1">DANB_fit <-<span class="st"> </span><span class="kw">NBumiFitModel</span>(deng_int) <span class="co"># DANB is fit to the raw count matrix</span></a> +<a class="sourceLine" id="cb603-2" data-line-number="2"><span class="co"># Perform DANB feature selection</span></a> +<a class="sourceLine" id="cb603-3" data-line-number="3">DropFS <-<span class="st"> </span><span class="kw">NBumiFeatureSelectionCombinedDrop</span>(DANB_fit, <span class="dt">method=</span><span class="st">"fdr"</span>, <span class="dt">qval.thresh=</span><span class="fl">0.01</span>, <span class="dt">suppress.plot=</span><span class="ot">FALSE</span>)</a></code></pre></div> +<p><img src="feature-selection_files/figure-html/unnamed-chunk-13-1.png" width="90%" style="display: block; margin: auto;" /></p> +<div class="sourceCode" id="cb604"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb604-1" data-line-number="1">DANB_genes <-<span class="st"> </span>DropFS[<span class="dv">1</span><span class="op">:</span><span class="dv">1500</span>,]<span class="op">$</span>Gene</a></code></pre></div> <p><strong>Exercise 5</strong> How many genes were signifcant using NBumiFeatureSelectionCombinedDrop?</p> +<pre><code>## [1] 10694</code></pre> +</div> +<div id="residual-variance-from-a-regularized-negative-binomial-model" class="section level4"> +<h4><span class="header-section-number">7.8.1.4</span> Residual variance from a (regularized) negative binomial model</h4> +<p>In the <a href="normalization-confounders-and-batch-correction.html#normalization-theory">normalization chapter</a> we introduced the +<code>sctransform</code> approach to using Pearson residuals from an regularized negative +binomial generalized linear model to normalize scRNA-seq data.</p> +<p>The residual variance of genes (i.e. 
the variance of the Pearson residuals) +provides a way to identify highly variable genes, where the “variance†is +decoupled from the average level of expression of the gene.</p> +<p>The residual variance is easily accessible from the <code>sctransform</code> output as we +show below.</p> +<p>First, we run <code>sctransform</code> as we did previously.</p> +<div class="sourceCode" id="cb606"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb606-1" data-line-number="1">deng_sparse <-<span class="st"> </span><span class="kw">as</span>(<span class="kw">counts</span>(deng), <span class="st">"dgCMatrix"</span>)</a> +<a class="sourceLine" id="cb606-2" data-line-number="2"><span class="co">### Genes expressed in at least 5 cells will be kept</span></a> +<a class="sourceLine" id="cb606-3" data-line-number="3">sctnorm_data <-<span class="st"> </span>sctransform<span class="op">::</span><span class="kw">vst</span>(<span class="dt">umi =</span> deng_sparse, <span class="dt">min_cells =</span> <span class="dv">1</span>,</a> +<a class="sourceLine" id="cb606-4" data-line-number="4"> <span class="dt">cell_attr =</span> <span class="kw">as.data.frame</span>(<span class="kw">colData</span>(deng)),</a> +<a class="sourceLine" id="cb606-5" data-line-number="5"> <span class="dt">latent_var =</span> <span class="st">"log10_total_counts_endogenous"</span>)</a></code></pre></div> +<pre><code>## + | + | | 0% + | + |======== | 12% + | + |================ | 25% + | + |======================== | 38% + | + |================================ | 50% + | + |========================================= | 62% + | + |================================================= | 75% + | + |========================================================= | 88% + | + |=================================================================| 100% +## + | + | | 0% + | + |= | 1% + | + |== | 2% + | + |== | 4% + | + |=== | 5% + | + |==== | 6% + | + |===== | 7% + | + |===== | 8% + | + |====== | 10% + | + 
|======= | 11% + | + |======== | 12% + | + |========= | 13% + | + |========= | 14% + | + |========== | 15% + | + |=========== | 17% + | + |============ | 18% + | + |============ | 19% + | + |============= | 20% + | + |============== | 21% + | + |=============== | 23% + | + |=============== | 24% + | + |================ | 25% + | + |================= | 26% + | + |================== | 27% + | + |=================== | 29% + | + |=================== | 30% + | + |==================== | 31% + | + |===================== | 32% + | + |====================== | 33% + | + |====================== | 35% + | + |======================= | 36% + | + |======================== | 37% + | + |========================= | 38% + | + |========================== | 39% + | + |========================== | 40% + | + |=========================== | 42% + | + |============================ | 43% + | + |============================= | 44% + | + |============================= | 45% + | + |============================== | 46% + | + |=============================== | 48% + | + |================================ | 49% + | + |================================ | 50% + | + |================================= | 51% + | + |================================== | 52% + | + |=================================== | 54% + | + |==================================== | 55% + | + |==================================== | 56% + | + |===================================== | 57% + | + |====================================== | 58% + | + |======================================= | 60% + | + |======================================= | 61% + | + |======================================== | 62% + | + |========================================= | 63% + | + |========================================== | 64% + | + |=========================================== | 65% + | + |=========================================== | 67% + | + |============================================ | 68% + | + |============================================= | 69% + | + 
|============================================== | 70% + | + |============================================== | 71% + | + |=============================================== | 73% + | + |================================================ | 74% + | + |================================================= | 75% + | + |================================================== | 76% + | + |================================================== | 77% + | + |=================================================== | 79% + | + |==================================================== | 80% + | + |===================================================== | 81% + | + |===================================================== | 82% + | + |====================================================== | 83% + | + |======================================================= | 85% + | + |======================================================== | 86% + | + |======================================================== | 87% + | + |========================================================= | 88% + | + |========================================================== | 89% + | + |=========================================================== | 90% + | + |============================================================ | 92% + | + |============================================================ | 93% + | + |============================================================= | 94% + | + |============================================================== | 95% + | + |=============================================================== | 96% + | + |=============================================================== | 98% + | + |================================================================ | 99% + | + |=================================================================| 100%</code></pre> +<div class="sourceCode" id="cb608"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb608-1" data-line-number="1">sctnorm_data<span 
class="op">$</span>model_str</a></code></pre></div> +<pre><code>## [1] "y ~ log10_total_counts_endogenous"</code></pre> +<div class="sourceCode" id="cb610"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb610-1" data-line-number="1"><span class="kw">library</span>(ggplot2)</a> +<a class="sourceLine" id="cb610-2" data-line-number="2"><span class="kw">ggplot</span>(sctnorm_data<span class="op">$</span>gene_attr, <span class="kw">aes</span>(residual_variance)) <span class="op">+</span></a> +<a class="sourceLine" id="cb610-3" data-line-number="3"><span class="st"> </span><span class="kw">geom_histogram</span>(<span class="dt">binwidth=</span><span class="fl">0.1</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb610-4" data-line-number="4"><span class="st"> </span><span class="kw">geom_vline</span>(<span class="dt">xintercept=</span><span class="dv">1</span>, <span class="dt">color=</span><span class="st">'red'</span>) <span class="op">+</span><span class="st"> </span><span class="kw">xlim</span>(<span class="dv">0</span>, <span class="dv">10</span>)</a></code></pre></div> +<p><img src="feature-selection_files/figure-html/sctransform-feature-select-1.png" width="90%" style="display: block; margin: auto;" /></p> +<div class="sourceCode" id="cb611"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb611-1" data-line-number="1">sctnorm_data<span class="op">$</span>gene_attr<span class="op">$</span>label <-<span class="st"> </span><span class="kw">rownames</span>(sctnorm_data<span class="op">$</span>gene_attr)</a> +<a class="sourceLine" id="cb611-2" data-line-number="2"><span class="kw">ggplot</span>(sctnorm_data<span class="op">$</span>gene_attr, <span class="kw">aes</span>(<span class="dt">x =</span> gmean, <span class="dt">y=</span>residual_variance)) <span class="op">+</span></a> +<a class="sourceLine" id="cb611-3" data-line-number="3"><span class="st"> </span><span 
class="kw">geom_point</span>(<span class="dt">alpha =</span> <span class="fl">0.6</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb611-4" data-line-number="4"><span class="st"> </span><span class="kw">geom_point</span>(<span class="dt">colour =</span> <span class="st">"firebrick2"</span>,</a> +<a class="sourceLine" id="cb611-5" data-line-number="5"> <span class="dt">data =</span> sctnorm_data<span class="op">$</span>gene_attr[sctnorm_data<span class="op">$</span>gene_attr<span class="op">$</span>residual_variance <span class="op">></span><span class="st"> </span><span class="dv">3</span>,]) <span class="op">+</span></a> +<a class="sourceLine" id="cb611-6" data-line-number="6"><span class="st"> </span><span class="kw">scale_x_log10</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb611-7" data-line-number="7"><span class="st"> </span><span class="kw">geom_hline</span>(<span class="dt">yintercept =</span> <span class="dv">1</span>, <span class="dt">size =</span> <span class="dv">3</span>, <span class="dt">color =</span> <span class="st">"dodgerblue"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb611-8" data-line-number="8"><span class="st"> </span><span class="kw">geom_label</span>(<span class="kw">aes</span>(<span class="dt">label =</span> label),</a> +<a class="sourceLine" id="cb611-9" data-line-number="9"> <span class="dt">data =</span> sctnorm_data<span class="op">$</span>gene_attr[sctnorm_data<span class="op">$</span>gene_attr<span class="op">$</span>residual_variance <span class="op">></span><span class="st"> </span><span class="dv">30</span>,]) <span class="op">+</span></a> +<a class="sourceLine" id="cb611-10" data-line-number="10"><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div> +<p><img src="feature-selection_files/figure-html/sctransform-feature-select-2.png" width="90%" style="display: block; margin: auto;" /></p> +<div class="sourceCode" id="cb612"><pre 
class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb612-1" data-line-number="1">sct_genes <-<span class="st"> </span><span class="kw">rownames</span>(sctnorm_data<span class="op">$</span>gene_attr)[sctnorm_data<span class="op">$</span>gene_attr<span class="op">$</span>residual_variance <span class="op">></span><span class="st"> </span><span class="dv">4</span>]</a> +<a class="sourceLine" id="cb612-2" data-line-number="2"><span class="kw">table</span>(sctnorm_data<span class="op">$</span>gene_attr<span class="op">$</span>residual_variance <span class="op">></span><span class="st"> </span><span class="dv">4</span>)</a></code></pre></div> +<pre><code>## +## FALSE TRUE +## 20077 1220</code></pre> +<p>If we set a (relatively arbitrary) threshold of a residual variance greater than +three marking a “highly variable geneâ€, then we identify around 2000 highly +variable genes with this <code>sctransform</code> approach.</p> +<p>[NB: the <code>deng</code> data is extremely high depth for scRNA-seq data, so not the most +applicable dataset for <code>sctransform</code>, but we include this analysis here to +demonstrate the method rather than make any evaluation of its performance in +general.]</p> +<p>Although not explored here, the <em>deviance</em> statistic from the regularized NB GLM +fit provides a natural way to select informative features for downstream +analyses.</p> +<p>The <a href="https://en.wikipedia.org/wiki/Deviance_(statistics)">deviance</a> is a +goodness-of-fit statistic for a statistical model. As Wikipedia notes, deviance +is a generalization of the idea of using the sum of squares of residuals in +ordinary least squares to cases where model-fitting is achieved by maximum +likelihood. 
It plays an important role in exponential dispersion models and +generalized linear models, such as the negative binomial model.</p> +<p>However, <code>sctransform</code> does not seem set up to use the model deviance to select +informative features, but we expect this could be a direction the field goes in +the near future. Keep an eye out!</p> </div> </div> <div id="correlated-expression" class="section level3"> <h3><span class="header-section-number">7.8.2</span> Correlated Expression</h3> -<p>A completely different approach to feature selection is to use gene-gene correlations. This method -is based on the idea that multiple genes will be differentially expressed between different cell-types -or cell-states. Genes which are expressed in the same cell-population will be positively correlated -with each other where as genes expressed in different cell-populations will be negatively correated with -each other. Thus important genes can be identified by the magnitude of their correlation -with other genes.</p> -<p>The limitation of this method is that it assumes technical noise is random and independent for each cell, -thus shouldn’t produce gene-gene correlations, but this assumption is violated by batch effects which are -generally systematic between different experimental batches and will produce gene-gene correlations. 
As a -result it is more appropriate to take the top few thousand genes as ranked by gene-gene correlation than -consider the significance of the correlations.</p> -<div class="sourceCode" id="cb549"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb549-1" data-line-number="1">cor_feat <-<span class="st"> </span>M3Drop<span class="op">::</span><span class="kw">corFS</span>(expr_matrix)</a> -<a class="sourceLine" id="cb549-2" data-line-number="2">Cor_genes <-<span class="st"> </span><span class="kw">names</span>(cor_feat)[<span class="dv">1</span><span class="op">:</span><span class="dv">1500</span>]</a></code></pre></div> -<p>Lastly, another common method for feature selection in scRNASeq data is to use PCA loadings. Genes with -high PCA loadings are likely to be highly variable and correlated with many other variable genes, thus -may be relevant to the underlying biology. However, as with gene-gene correlations PCA loadings tend to -be susceptible to detecting systematic variation due to batch effects; thus it is recommended to plot the PCA -results to determine those components corresponding to the biological variation rather than batch effects.</p> -<div class="sourceCode" id="cb550"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb550-1" data-line-number="1"><span class="co"># PCA is typically performed on log-transformed expression data</span></a> -<a class="sourceLine" id="cb550-2" data-line-number="2">pca <-<span class="st"> </span><span class="kw">prcomp</span>(<span class="kw">log</span>(expr_matrix <span class="op">+</span><span class="st"> </span><span class="dv">1</span>) <span class="op">/</span><span class="st"> </span><span class="kw">log</span>(<span class="dv">2</span>))</a> -<a class="sourceLine" id="cb550-3" data-line-number="3"></a> -<a class="sourceLine" id="cb550-4" data-line-number="4"><span class="co"># plot projection</span></a> -<a class="sourceLine" id="cb550-5" 
data-line-number="5"><span class="kw">plot</span>(</a> -<a class="sourceLine" id="cb550-6" data-line-number="6"> pca<span class="op">$</span>rotation[,<span class="dv">1</span>], </a> -<a class="sourceLine" id="cb550-7" data-line-number="7"> pca<span class="op">$</span>rotation[,<span class="dv">2</span>], </a> -<a class="sourceLine" id="cb550-8" data-line-number="8"> <span class="dt">pch =</span> <span class="dv">16</span>, </a> -<a class="sourceLine" id="cb550-9" data-line-number="9"> <span class="dt">col =</span> cell_colors[<span class="kw">as.factor</span>(celltype_labs)]</a> -<a class="sourceLine" id="cb550-10" data-line-number="10">) </a> -<a class="sourceLine" id="cb550-11" data-line-number="11"><span class="co"># calculate loadings for components 1 and 2</span></a> -<a class="sourceLine" id="cb550-12" data-line-number="12">score <-<span class="st"> </span><span class="kw">rowSums</span>(<span class="kw">abs</span>(pca<span class="op">$</span>x[,<span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">2</span>)])) </a> -<a class="sourceLine" id="cb550-13" data-line-number="13"><span class="kw">names</span>(score) <-<span class="st"> </span><span class="kw">rownames</span>(expr_matrix)</a> -<a class="sourceLine" id="cb550-14" data-line-number="14">score <-<span class="st"> </span>score[<span class="kw">order</span>(<span class="op">-</span>score)]</a> -<a class="sourceLine" id="cb550-15" data-line-number="15">PCA_genes <-<span class="st"> </span><span class="kw">names</span>(score[<span class="dv">1</span><span class="op">:</span><span class="dv">1500</span>])</a></code></pre></div> -<p><strong>Exercise 6</strong> -Consider the top 5 principal components. Which appear to be most biologically relevant? How does the top 1,500 -features change if you consider the loadings for those components?</p> +<p>A completely different approach to feature selection is to use gene-gene +correlations. 
This method is based on the idea that multiple genes will be +differentially expressed between different cell-types or cell-states. Genes +which are expressed in the same cell-population will be positively correlated +with each other where as genes expressed in different cell-populations will be +negatively correated with each other. Thus important genes can be identified by +the magnitude of their correlation with other genes.</p> +<p>The limitation of this method is that it assumes technical noise is random and +independent for each cell, thus shouldn’t produce gene-gene correlations, but +this assumption is violated by batch effects which are generally systematic +between different experimental batches and will produce gene-gene correlations. +As a result it is more appropriate to take the top few thousand genes as ranked +by gene-gene correlation than consider the significance of the correlations.</p> +<div class="sourceCode" id="cb614"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb614-1" data-line-number="1">cor_feat <-<span class="st"> </span>M3Drop<span class="op">::</span><span class="kw">corFS</span>(expr_matrix)</a> +<a class="sourceLine" id="cb614-2" data-line-number="2">Cor_genes <-<span class="st"> </span><span class="kw">names</span>(cor_feat)[<span class="dv">1</span><span class="op">:</span><span class="dv">1500</span>]</a></code></pre></div> </div> <div id="comparing-methods" class="section level3"> <h3><span class="header-section-number">7.8.3</span> Comparing Methods</h3> -<p>We can check whether the identified features really do represent genes differentially expressed between -cell-types in this dataset.</p> -<div class="sourceCode" id="cb551"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb551-1" data-line-number="1"><span class="kw">M3DropExpressionHeatmap</span>(</a> -<a class="sourceLine" id="cb551-2" data-line-number="2"> M3Drop_genes,</a> -<a class="sourceLine" 
id="cb551-3" data-line-number="3"> expr_matrix,</a> -<a class="sourceLine" id="cb551-4" data-line-number="4"> <span class="dt">cell_labels =</span> celltype_labs</a> -<a class="sourceLine" id="cb551-5" data-line-number="5">)</a></code></pre></div> +<p>We can check whether the identified features really do represent genes +differentially expressed between cell-types in this dataset.</p> +<div class="sourceCode" id="cb615"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb615-1" data-line-number="1"><span class="kw">M3DropExpressionHeatmap</span>(</a> +<a class="sourceLine" id="cb615-2" data-line-number="2"> M3Drop_genes,</a> +<a class="sourceLine" id="cb615-3" data-line-number="3"> expr_matrix,</a> +<a class="sourceLine" id="cb615-4" data-line-number="4"> <span class="dt">cell_labels =</span> celltype_labs</a> +<a class="sourceLine" id="cb615-5" data-line-number="5">)</a></code></pre></div> +<p><img src="feature-selection_files/figure-html/unnamed-chunk-16-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>We can also consider how consistent each feature selection method is with the others using the Jaccard Index:</p> -<div class="sourceCode" id="cb552"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb552-1" data-line-number="1">J <-<span class="st"> </span><span class="kw">sum</span>(M3Drop_genes <span class="op">%in%</span><span class="st"> </span>HVG_genes)<span class="op">/</span><span class="kw">length</span>(<span class="kw">unique</span>(<span class="kw">c</span>(M3Drop_genes, HVG_genes)))</a></code></pre></div> -<p><strong>Exercise 7</strong></p> -<p>Plot the expression of the features for each of the other methods. Which appear to be differentially expressed? 
How consistent are the different methods for this dataset?</p> +<div class="sourceCode" id="cb616"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb616-1" data-line-number="1">J <-<span class="st"> </span><span class="kw">sum</span>(M3Drop_genes <span class="op">%in%</span><span class="st"> </span>HVG_genes)<span class="op">/</span><span class="kw">length</span>(<span class="kw">unique</span>(<span class="kw">c</span>(M3Drop_genes, HVG_genes)))</a></code></pre></div> +<p><strong>Exercise 6</strong></p> +<p>Plot the expression of the features for each of the other methods. Which appear +to be differentially expressed? How consistent are the different methods for +this dataset?</p> +<div class="sourceCode" id="cb617"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb617-1" data-line-number="1"><span class="kw">M3DropExpressionHeatmap</span>(</a> +<a class="sourceLine" id="cb617-2" data-line-number="2"> DANB_genes,</a> +<a class="sourceLine" id="cb617-3" data-line-number="3"> expr_matrix,</a> +<a class="sourceLine" id="cb617-4" data-line-number="4"> <span class="dt">cell_labels =</span> celltype_labs</a> +<a class="sourceLine" id="cb617-5" data-line-number="5">)</a></code></pre></div> +<p><img src="feature-selection_files/figure-html/unnamed-chunk-18-1.png" width="90%" style="display: block; margin: auto;" /></p> +<p>Jaccard index comparison of sets of informative features:</p> +<div class="sourceCode" id="cb618"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb618-1" data-line-number="1">list_of_features <-<span class="st"> </span><span class="kw">list</span>(</a> +<a class="sourceLine" id="cb618-2" data-line-number="2"> M3Drop_genes,</a> +<a class="sourceLine" id="cb618-3" data-line-number="3"> DANB_genes,</a> +<a class="sourceLine" id="cb618-4" data-line-number="4"> HVG_genes, </a> +<a class="sourceLine" id="cb618-5" data-line-number="5"> simplesinglecell_genes, </a> +<a 
class="sourceLine" id="cb618-6" data-line-number="6"> sct_genes</a> +<a class="sourceLine" id="cb618-7" data-line-number="7">)</a> +<a class="sourceLine" id="cb618-8" data-line-number="8">Out <-<span class="st"> </span><span class="kw">matrix</span>(</a> +<a class="sourceLine" id="cb618-9" data-line-number="9"> <span class="dv">0</span>, </a> +<a class="sourceLine" id="cb618-10" data-line-number="10"> <span class="dt">ncol =</span> <span class="kw">length</span>(list_of_features), </a> +<a class="sourceLine" id="cb618-11" data-line-number="11"> <span class="dt">nrow =</span> <span class="kw">length</span>(list_of_features)</a> +<a class="sourceLine" id="cb618-12" data-line-number="12">)</a> +<a class="sourceLine" id="cb618-13" data-line-number="13"><span class="cf">for</span>(i <span class="cf">in</span> <span class="dv">1</span><span class="op">:</span><span class="kw">length</span>(list_of_features) ) {</a> +<a class="sourceLine" id="cb618-14" data-line-number="14"> <span class="cf">for</span>(j <span class="cf">in</span> <span class="dv">1</span><span class="op">:</span><span class="kw">length</span>(list_of_features) ) {</a> +<a class="sourceLine" id="cb618-15" data-line-number="15"> Out[i,j] <-<span class="st"> </span><span class="kw">sum</span>(list_of_features[[i]] <span class="op">%in%</span><span class="st"> </span>list_of_features[[j]])<span class="op">/</span></a> +<a class="sourceLine" id="cb618-16" data-line-number="16"><span class="st"> </span><span class="kw">length</span>(<span class="kw">unique</span>(<span class="kw">c</span>(list_of_features[[i]], list_of_features[[j]])))</a> +<a class="sourceLine" id="cb618-17" data-line-number="17"> }</a> +<a class="sourceLine" id="cb618-18" data-line-number="18">}</a> +<a class="sourceLine" id="cb618-19" data-line-number="19"><span class="kw">colnames</span>(Out) <-<span class="st"> </span><span class="kw">rownames</span>(Out) <-<span class="st"> </span><span class="kw">c</span>(<span 
class="st">"M3Drop"</span>, <span class="st">"DANB"</span>, <span class="st">"Brennecke"</span>, <span class="st">"simpleSingleCell"</span>, <span class="st">"sctransform"</span>)</a> +<a class="sourceLine" id="cb618-20" data-line-number="20">Out</a></code></pre></div> +<pre><code>## M3Drop DANB Brennecke simpleSingleCell +## M3Drop 1.0000000 0.38019061 0.4152905 0.14615908 +## DANB 0.3801906 1.00000000 0.2283346 0.09868187 +## Brennecke 0.4152905 0.22833459 1.0000000 0.15019157 +## simpleSingleCell 0.1461591 0.09868187 0.1501916 1.00000000 +## sctransform 0.2343257 0.21801471 0.2718985 0.26034913 +## sctransform +## M3Drop 0.2343257 +## DANB 0.2180147 +## Brennecke 0.2718985 +## simpleSingleCell 0.2603491 +## sctransform 1.0000000</code></pre> </div> <div id="sessioninfo-5" class="section level3"> <h3><span class="header-section-number">7.8.4</span> sessionInfo()</h3> +<pre><code>## R version 3.6.0 (2019-04-26) +## Platform: x86_64-pc-linux-gnu (64-bit) +## Running under: Ubuntu 18.04.3 LTS +## +## Matrix products: default +## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 +## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 +## +## locale: +## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C +## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 +## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 +## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C +## [9] LC_ADDRESS=C LC_TELEPHONE=C +## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C +## +## attached base packages: +## [1] parallel stats4 stats graphics grDevices utils datasets +## [8] methods base +## +## other attached packages: +## [1] scran_1.12.1 scater_1.12.2 +## [3] ggplot2_3.2.1 Polychrome_1.2.3 +## [5] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 +## [7] DelayedArray_0.10.0 BiocParallel_1.18.1 +## [9] Biobase_2.44.0 GenomicRanges_1.36.1 +## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 +## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 +## [15] RColorBrewer_1.1-2 M3Drop_1.10.0 +## [17] numDeriv_2016.8-1.1 
matrixStats_0.55.0 +## [19] scRNA.seq.funcs_0.1.0 +## +## loaded via a namespace (and not attached): +## [1] Rtsne_0.15 ggbeeswarm_0.6.0 +## [3] colorspace_1.4-1 dynamicTreeCut_1.63-1 +## [5] htmlTable_1.13.2 XVector_0.24.0 +## [7] base64enc_0.1-3 BiocNeighbors_1.2.0 +## [9] rstudioapi_0.10 listenv_0.7.0 +## [11] codetools_0.2-16 splines_3.6.0 +## [13] knitr_1.25 Formula_1.2-3 +## [15] cluster_2.1.0 sctransform_0.2.0 +## [17] compiler_3.6.0 dqrng_0.2.1 +## [19] backports_1.1.4 assertthat_0.2.1 +## [21] Matrix_1.2-17 lazyeval_0.2.2 +## [23] limma_3.40.6 BiocSingular_1.0.0 +## [25] acepack_1.4.1 htmltools_0.3.6 +## [27] tools_3.6.0 rsvd_1.0.2 +## [29] igraph_1.2.4.1 gtable_0.3.0 +## [31] glue_1.3.1 GenomeInfoDbData_1.2.1 +## [33] reshape2_1.4.3 dplyr_0.8.3 +## [35] Rcpp_1.0.2 bbmle_1.0.20 +## [37] gdata_2.18.0 nlme_3.1-139 +## [39] DelayedMatrixStats_1.6.1 xfun_0.9 +## [41] stringr_1.4.0 globals_0.12.4 +## [43] irlba_2.3.3 gtools_3.8.1 +## [45] hypergeo_1.2-13 statmod_1.4.32 +## [47] future_1.14.0 edgeR_3.26.8 +## [49] zlibbioc_1.30.0 MASS_7.3-51.1 +## [51] scales_1.0.0 yaml_2.2.0 +## [53] gridExtra_2.3 rpart_4.1-15 +## [55] latticeExtra_0.6-28 stringi_1.4.3 +## [57] checkmate_1.9.4 orthopolynom_1.0-5 +## [59] contfrac_1.1-12 caTools_1.17.1.2 +## [61] rlang_0.4.0 pkgconfig_2.0.3 +## [63] moments_0.14 bitops_1.0-6 +## [65] evaluate_0.14 lattice_0.20-38 +## [67] purrr_0.3.2 htmlwidgets_1.3 +## [69] labeling_0.3 cowplot_1.0.0 +## [71] tidyselect_0.2.5 deSolve_1.24 +## [73] plyr_1.8.4 magrittr_1.5 +## [75] bookdown_0.13 R6_2.4.0 +## [77] gplots_3.0.1.1 Hmisc_4.2-0 +## [79] pillar_1.4.2 foreign_0.8-70 +## [81] withr_2.1.2 mgcv_1.8-28 +## [83] survival_2.43-3 scatterplot3d_0.3-41 +## [85] RCurl_1.95-4.12 nnet_7.3-12 +## [87] future.apply_1.3.0 tibble_2.1.3 +## [89] crayon_1.3.4 KernSmooth_2.23-15 +## [91] rmarkdown_1.15 viridis_0.5.1 +## [93] locfit_1.5-9.1 grid_3.6.0 +## [95] data.table_1.12.2 reldist_1.6-6 +## [97] digest_0.6.21 elliptic_1.4-0 +## [99] munsell_0.5.0 
beeswarm_0.2.3 +## [101] viridisLite_0.3.0 vipor_0.4.5</code></pre> </div> </div> diff --git a/public/processing-raw-scrna-seq-data.html b/public/processing-raw-scrna-seq-data.html index f1aa56a5fe0ed7c56b9100ec09fee8f1b079e04b..a58fa4780ab488b994cb9c257cfe637685b7d215 100644 --- a/public/processing-raw-scrna-seq-data.html +++ b/public/processing-raw-scrna-seq-data.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> 
<li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" 
data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> 
Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li 
class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -1057,10 +1057,12 @@ with fewer than 10 total molecules.</p> barcode suddenly drops:</p> <div class="sourceCode" id="cb176"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb176-1" data-line-number="1">barcode_rank <-<span class="st"> </span><span class="kw">rank</span>(<span class="op">-</span>umi_per_barcode[,<span class="dv">2</span>])</a> <a class="sourceLine" 
id="cb176-2" data-line-number="2"><span class="kw">plot</span>(barcode_rank, umi_per_barcode[,<span class="dv">2</span>], <span class="dt">xlim=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">8000</span>))</a></code></pre></div> +<p><img src="cell-calling_files/figure-html/unnamed-chunk-4-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>Here we can see an roughly exponential curve of library sizes, so to make things simpler lets log-transform them.</p> <div class="sourceCode" id="cb177"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb177-1" data-line-number="1">log_lib_size <-<span class="st"> </span><span class="kw">log10</span>(umi_per_barcode[,<span class="dv">2</span>])</a> <a class="sourceLine" id="cb177-2" data-line-number="2"><span class="kw">plot</span>(barcode_rank, log_lib_size, <span class="dt">xlim=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">8000</span>))</a></code></pre></div> +<p><img src="cell-calling_files/figure-html/unnamed-chunk-5-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>That’s better, the “knee†in the distribution is much more pronounced. 
We could manually estimate where the “knee†is but it much more reproducible to algorithmically identify this point.</p> @@ -1073,30 +1075,36 @@ algorithmically identify this point.</p> <a class="sourceLine" id="cb178-7" data-line-number="7">inflection <-<span class="st"> </span><span class="kw">which</span>(rawdiff <span class="op">==</span><span class="st"> </span><span class="kw">min</span>(rawdiff[<span class="dv">100</span><span class="op">:</span><span class="kw">length</span>(rawdiff)], <span class="dt">na.rm=</span><span class="ot">TRUE</span>))</a> <a class="sourceLine" id="cb178-8" data-line-number="8"></a> <a class="sourceLine" id="cb178-9" data-line-number="9"><span class="kw">plot</span>(barcode_rank, log_lib_size, <span class="dt">xlim=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">8000</span>))</a> -<a class="sourceLine" id="cb178-10" data-line-number="10"><span class="kw">abline</span>(<span class="dt">v=</span>inflection, <span class="dt">col=</span><span class="st">"red"</span>, <span class="dt">lwd=</span><span class="dv">2</span>)</a> -<a class="sourceLine" id="cb178-11" data-line-number="11"></a> -<a class="sourceLine" id="cb178-12" data-line-number="12">threshold <-<span class="st"> </span><span class="dv">10</span><span class="op">^</span>log_lib_size[inflection]</a> -<a class="sourceLine" id="cb178-13" data-line-number="13"></a> -<a class="sourceLine" id="cb178-14" data-line-number="14">cells <-<span class="st"> </span>umi_per_barcode[umi_per_barcode[,<span class="dv">2</span>] <span class="op">></span><span class="st"> </span>threshold,<span class="dv">1</span>]</a> -<a class="sourceLine" id="cb178-15" data-line-number="15">TPR <-<span class="st"> </span><span class="kw">sum</span>(cells <span class="op">%in%</span><span class="st"> </span>truth[,<span class="dv">1</span>])<span class="op">/</span><span class="kw">length</span>(cells)</a> -<a class="sourceLine" id="cb178-16" data-line-number="16">Recall <-<span 
class="st"> </span><span class="kw">sum</span>(cells <span class="op">%in%</span><span class="st"> </span>truth[,<span class="dv">1</span>])<span class="op">/</span><span class="kw">length</span>(truth[,<span class="dv">1</span>])</a> -<a class="sourceLine" id="cb178-17" data-line-number="17"><span class="kw">c</span>(TPR, Recall)</a></code></pre></div> +<a class="sourceLine" id="cb178-10" data-line-number="10"><span class="kw">abline</span>(<span class="dt">v=</span>inflection, <span class="dt">col=</span><span class="st">"red"</span>, <span class="dt">lwd=</span><span class="dv">2</span>)</a></code></pre></div> +<p><img src="cell-calling_files/figure-html/unnamed-chunk-6-1.png" width="90%" style="display: block; margin: auto;" /></p> +<div class="sourceCode" id="cb179"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb179-1" data-line-number="1">threshold <-<span class="st"> </span><span class="dv">10</span><span class="op">^</span>log_lib_size[inflection]</a> +<a class="sourceLine" id="cb179-2" data-line-number="2"></a> +<a class="sourceLine" id="cb179-3" data-line-number="3">cells <-<span class="st"> </span>umi_per_barcode[umi_per_barcode[,<span class="dv">2</span>] <span class="op">></span><span class="st"> </span>threshold,<span class="dv">1</span>]</a> +<a class="sourceLine" id="cb179-4" data-line-number="4">TPR <-<span class="st"> </span><span class="kw">sum</span>(cells <span class="op">%in%</span><span class="st"> </span>truth[,<span class="dv">1</span>])<span class="op">/</span><span class="kw">length</span>(cells)</a> +<a class="sourceLine" id="cb179-5" data-line-number="5">Recall <-<span class="st"> </span><span class="kw">sum</span>(cells <span class="op">%in%</span><span class="st"> </span>truth[,<span class="dv">1</span>])<span class="op">/</span><span class="kw">length</span>(truth[,<span class="dv">1</span>])</a> +<a class="sourceLine" id="cb179-6" data-line-number="6"><span class="kw">c</span>(TPR, 
Recall)</a></code></pre></div> +<pre><code>## [1] 1.0000000 0.7831707</code></pre> </div> <div id="mixture-model" class="section level3"> <h3><span class="header-section-number">5.9.2</span> Mixture model</h3> <p>Another is to fix a mixture model and find where the higher and lower distributions intersect. However, data may not fit the assumed distributions very well:</p> -<div class="sourceCode" id="cb179"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb179-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="op">-</span><span class="dv">92497</span>)</a> -<a class="sourceLine" id="cb179-2" data-line-number="2"><span class="co"># mixture model</span></a> -<a class="sourceLine" id="cb179-3" data-line-number="3"><span class="kw">require</span>(<span class="st">"mixtools"</span>)</a> -<a class="sourceLine" id="cb179-4" data-line-number="4">mix <-<span class="st"> </span><span class="kw">normalmixEM</span>(log_lib_size)</a> -<a class="sourceLine" id="cb179-5" data-line-number="5"><span class="kw">plot</span>(mix, <span class="dt">which=</span><span class="dv">2</span>, <span class="dt">xlab2=</span><span class="st">"log(mol per cell)"</span>)</a> -<a class="sourceLine" id="cb179-6" data-line-number="6">p1 <-<span class="st"> </span><span class="kw">dnorm</span>(log_lib_size, <span class="dt">mean=</span>mix<span class="op">$</span>mu[<span class="dv">1</span>], <span class="dt">sd=</span>mix<span class="op">$</span>sigma[<span class="dv">1</span>])</a> -<a class="sourceLine" id="cb179-7" data-line-number="7">p2 <-<span class="st"> </span><span class="kw">dnorm</span>(log_lib_size, <span class="dt">mean=</span>mix<span class="op">$</span>mu[<span class="dv">2</span>], <span class="dt">sd=</span>mix<span class="op">$</span>sigma[<span class="dv">2</span>])</a> -<a class="sourceLine" id="cb179-8" data-line-number="8"><span class="cf">if</span> (mix<span class="op">$</span>mu[<span class="dv">1</span>] <span 
class="op"><</span><span class="st"> </span>mix<span class="op">$</span>mu[<span class="dv">2</span>]) {</a> -<a class="sourceLine" id="cb179-9" data-line-number="9"> split <-<span class="st"> </span><span class="kw">min</span>(log_lib_size[p2 <span class="op">></span><span class="st"> </span>p1])</a> -<a class="sourceLine" id="cb179-10" data-line-number="10">} <span class="cf">else</span> {</a> -<a class="sourceLine" id="cb179-11" data-line-number="11"> split <-<span class="st"> </span><span class="kw">min</span>(log_lib_size[p1 <span class="op">></span><span class="st"> </span>p2])</a> -<a class="sourceLine" id="cb179-12" data-line-number="12">}</a></code></pre></div> +<div class="sourceCode" id="cb181"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb181-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="op">-</span><span class="dv">92497</span>)</a> +<a class="sourceLine" id="cb181-2" data-line-number="2"><span class="co"># mixture model</span></a> +<a class="sourceLine" id="cb181-3" data-line-number="3"><span class="kw">require</span>(<span class="st">"mixtools"</span>)</a></code></pre></div> +<pre><code>## Loading required package: mixtools</code></pre> +<pre><code>## mixtools package, version 1.1.0, Released 2017-03-10 +## This package is based upon work supported by the National Science Foundation under Grant No. 
SES-0518772.</code></pre> +<div class="sourceCode" id="cb184"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb184-1" data-line-number="1">mix <-<span class="st"> </span><span class="kw">normalmixEM</span>(log_lib_size)</a></code></pre></div> +<pre><code>## number of iterations= 43</code></pre> +<div class="sourceCode" id="cb186"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb186-1" data-line-number="1"><span class="kw">plot</span>(mix, <span class="dt">which=</span><span class="dv">2</span>, <span class="dt">xlab2=</span><span class="st">"log(mol per cell)"</span>)</a></code></pre></div> +<p><img src="cell-calling_files/figure-html/unnamed-chunk-7-1.png" width="90%" style="display: block; margin: auto;" /></p> +<div class="sourceCode" id="cb187"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb187-1" data-line-number="1">p1 <-<span class="st"> </span><span class="kw">dnorm</span>(log_lib_size, <span class="dt">mean=</span>mix<span class="op">$</span>mu[<span class="dv">1</span>], <span class="dt">sd=</span>mix<span class="op">$</span>sigma[<span class="dv">1</span>])</a> +<a class="sourceLine" id="cb187-2" data-line-number="2">p2 <-<span class="st"> </span><span class="kw">dnorm</span>(log_lib_size, <span class="dt">mean=</span>mix<span class="op">$</span>mu[<span class="dv">2</span>], <span class="dt">sd=</span>mix<span class="op">$</span>sigma[<span class="dv">2</span>])</a> +<a class="sourceLine" id="cb187-3" data-line-number="3"><span class="cf">if</span> (mix<span class="op">$</span>mu[<span class="dv">1</span>] <span class="op"><</span><span class="st"> </span>mix<span class="op">$</span>mu[<span class="dv">2</span>]) {</a> +<a class="sourceLine" id="cb187-4" data-line-number="4"> split <-<span class="st"> </span><span class="kw">min</span>(log_lib_size[p2 <span class="op">></span><span class="st"> </span>p1])</a> +<a class="sourceLine" id="cb187-5" 
data-line-number="5">} <span class="cf">else</span> {</a> +<a class="sourceLine" id="cb187-6" data-line-number="6"> split <-<span class="st"> </span><span class="kw">min</span>(log_lib_size[p1 <span class="op">></span><span class="st"> </span>p2])</a> +<a class="sourceLine" id="cb187-7" data-line-number="7">}</a></code></pre></div> <p><strong>Exercise</strong> Identify cells using this split point and calculate the TPR and Recall.</p> <p><strong>Answer</strong></p> @@ -1105,15 +1113,16 @@ Identify cells using this split point and calculate the TPR and Recall.</p> <h3><span class="header-section-number">5.9.3</span> Expected Number of Cells</h3> <p>A third method used by CellRanger V2, assumes a ~10-fold range of library sizes for real cells and estimates this range using the expected number of cells.</p> -<div class="sourceCode" id="cb180"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb180-1" data-line-number="1">n_cells <-<span class="st"> </span><span class="kw">length</span>(truth[,<span class="dv">1</span>])</a> -<a class="sourceLine" id="cb180-2" data-line-number="2"><span class="co"># CellRanger v2</span></a> -<a class="sourceLine" id="cb180-3" data-line-number="3">totals <-<span class="st"> </span>umi_per_barcode[,<span class="dv">2</span>]</a> -<a class="sourceLine" id="cb180-4" data-line-number="4">totals <-<span class="st"> </span><span class="kw">sort</span>(totals, <span class="dt">decreasing =</span> <span class="ot">TRUE</span>)</a> -<a class="sourceLine" id="cb180-5" data-line-number="5"><span class="co"># 99th percentile of top n_cells divided by 10</span></a> -<a class="sourceLine" id="cb180-6" data-line-number="6">thresh =<span class="st"> </span>totals[<span class="kw">round</span>(<span class="fl">0.01</span><span class="op">*</span>n_cells)]<span class="op">/</span><span class="dv">10</span></a> -<a class="sourceLine" id="cb180-7" data-line-number="7"><span class="kw">plot</span>(totals, <span 
class="dt">xlim=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">8000</span>))</a> -<a class="sourceLine" id="cb180-8" data-line-number="8"><span class="kw">abline</span>(<span class="dt">h=</span>thresh, <span class="dt">col=</span><span class="st">"red"</span>, <span class="dt">lwd=</span><span class="dv">2</span>)</a></code></pre></div> -<p><strong>Exercise</strong> +<div class="sourceCode" id="cb188"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb188-1" data-line-number="1">n_cells <-<span class="st"> </span><span class="kw">length</span>(truth[,<span class="dv">1</span>])</a> +<a class="sourceLine" id="cb188-2" data-line-number="2"><span class="co"># CellRanger v2</span></a> +<a class="sourceLine" id="cb188-3" data-line-number="3">totals <-<span class="st"> </span>umi_per_barcode[,<span class="dv">2</span>]</a> +<a class="sourceLine" id="cb188-4" data-line-number="4">totals <-<span class="st"> </span><span class="kw">sort</span>(totals, <span class="dt">decreasing =</span> <span class="ot">TRUE</span>)</a> +<a class="sourceLine" id="cb188-5" data-line-number="5"><span class="co"># 99th percentile of top n_cells divided by 10</span></a> +<a class="sourceLine" id="cb188-6" data-line-number="6">thresh =<span class="st"> </span>totals[<span class="kw">round</span>(<span class="fl">0.01</span><span class="op">*</span>n_cells)]<span class="op">/</span><span class="dv">10</span></a> +<a class="sourceLine" id="cb188-7" data-line-number="7"><span class="kw">plot</span>(totals, <span class="dt">xlim=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">8000</span>))</a> +<a class="sourceLine" id="cb188-8" data-line-number="8"><span class="kw">abline</span>(<span class="dt">h=</span>thresh, <span class="dt">col=</span><span class="st">"red"</span>, <span class="dt">lwd=</span><span class="dv">2</span>)</a></code></pre></div> +<p><img 
src="cell-calling_files/figure-html/unnamed-chunk-9-1.png" width="90%" style="display: block; margin: auto;" /> +<strong>Exercise</strong> Identify cells using this threshodl and calculate the TPR and Recall.</p> <p><strong>Answer</strong></p> </div> @@ -1128,44 +1137,44 @@ similar to the expression profile of the largests cells in a population. As such EmptyDrops is the only method able to identify barcodes for very small cells in highly diverse samples.</p> <p>Below we have provided code for how this method is currently run:</p> -<div class="sourceCode" id="cb181"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb181-1" data-line-number="1"><span class="kw">library</span>(<span class="st">"Matrix"</span>)</a> -<a class="sourceLine" id="cb181-2" data-line-number="2">raw.counts <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/muraro.rds"</span>)</a> -<a class="sourceLine" id="cb181-3" data-line-number="3"></a> -<a class="sourceLine" id="cb181-4" data-line-number="4"><span class="kw">library</span>(<span class="st">"DropletUtils"</span>)</a> -<a class="sourceLine" id="cb181-5" data-line-number="5"><span class="kw">example</span>(write10xCounts, <span class="dt">echo=</span><span class="ot">FALSE</span>) </a> -<a class="sourceLine" id="cb181-6" data-line-number="6">dir.name <-<span class="st"> </span>tmpdir</a> -<a class="sourceLine" id="cb181-7" data-line-number="7"><span class="kw">list.files</span>(dir.name)</a> -<a class="sourceLine" id="cb181-8" data-line-number="8">sce <-<span class="st"> </span><span class="kw">read10xCounts</span>(dir.name)</a> -<a class="sourceLine" id="cb181-9" data-line-number="9">sce</a> -<a class="sourceLine" id="cb181-10" data-line-number="10">my.counts <-<span class="st"> </span>DropletUtils<span class="op">:::</span><span class="kw">simCounts</span>()</a> -<a class="sourceLine" id="cb181-11" data-line-number="11">br.out <-<span class="st"> </span><span 
class="kw">barcodeRanks</span>(my.counts)</a> -<a class="sourceLine" id="cb181-12" data-line-number="12"></a> -<a class="sourceLine" id="cb181-13" data-line-number="13"><span class="co"># Making a plot.</span></a> -<a class="sourceLine" id="cb181-14" data-line-number="14"><span class="kw">plot</span>(br.out<span class="op">$</span>rank, br.out<span class="op">$</span>total, <span class="dt">log=</span><span class="st">"xy"</span>, <span class="dt">xlab=</span><span class="st">"Rank"</span>, <span class="dt">ylab=</span><span class="st">"Total"</span>)</a> -<a class="sourceLine" id="cb181-15" data-line-number="15">o <-<span class="st"> </span><span class="kw">order</span>(br.out<span class="op">$</span>rank)</a> -<a class="sourceLine" id="cb181-16" data-line-number="16"><span class="kw">lines</span>(br.out<span class="op">$</span>rank[o], br.out<span class="op">$</span>fitted[o], <span class="dt">col=</span><span class="st">"red"</span>)</a> -<a class="sourceLine" id="cb181-17" data-line-number="17"></a> -<a class="sourceLine" id="cb181-18" data-line-number="18"><span class="kw">abline</span>(<span class="dt">h=</span><span class="kw">metadata</span>(br.out)<span class="op">$</span>knee, <span class="dt">col=</span><span class="st">"dodgerblue"</span>, <span class="dt">lty=</span><span class="dv">2</span>)</a> -<a class="sourceLine" id="cb181-19" data-line-number="19"><span class="kw">abline</span>(<span class="dt">h=</span><span class="kw">metadata</span>(br.out)<span class="op">$</span>inflection, <span class="dt">col=</span><span class="st">"forestgreen"</span>, <span class="dt">lty=</span><span class="dv">2</span>)</a> -<a class="sourceLine" id="cb181-20" data-line-number="20"><span class="kw">legend</span>(<span class="st">"bottomleft"</span>, <span class="dt">lty=</span><span class="dv">2</span>, <span class="dt">col=</span><span class="kw">c</span>(<span class="st">"dodgerblue"</span>, <span class="st">"forestgreen"</span>), </a> -<a class="sourceLine" 
id="cb181-21" data-line-number="21"> <span class="dt">legend=</span><span class="kw">c</span>(<span class="st">"knee"</span>, <span class="st">"inflection"</span>))</a> -<a class="sourceLine" id="cb181-22" data-line-number="22"><span class="co"># emptyDrops</span></a> -<a class="sourceLine" id="cb181-23" data-line-number="23"><span class="kw">set.seed</span>(<span class="dv">100</span>)</a> -<a class="sourceLine" id="cb181-24" data-line-number="24">e.out <-<span class="st"> </span><span class="kw">emptyDrops</span>(my.counts)</a> -<a class="sourceLine" id="cb181-25" data-line-number="25">is.cell <-<span class="st"> </span>e.out<span class="op">$</span>FDR <span class="op"><=</span><span class="st"> </span><span class="fl">0.01</span></a> -<a class="sourceLine" id="cb181-26" data-line-number="26"><span class="kw">sum</span>(is.cell, <span class="dt">na.rm=</span><span class="ot">TRUE</span>)</a> -<a class="sourceLine" id="cb181-27" data-line-number="27"></a> -<a class="sourceLine" id="cb181-28" data-line-number="28"><span class="kw">plot</span>(e.out<span class="op">$</span>Total, <span class="op">-</span>e.out<span class="op">$</span>LogProb, <span class="dt">col=</span><span class="kw">ifelse</span>(is.cell, <span class="st">"red"</span>, <span class="st">"black"</span>),</a> -<a class="sourceLine" id="cb181-29" data-line-number="29"> <span class="dt">xlab=</span><span class="st">"Total UMI count"</span>, <span class="dt">ylab=</span><span class="st">"-Log Probability"</span>)</a> -<a class="sourceLine" id="cb181-30" data-line-number="30"></a> -<a class="sourceLine" id="cb181-31" data-line-number="31"><span class="co"># plot(e.out$Total, -e.out$LogProb, col=ifelse(is.cell, "red", "black"),</span></a> -<a class="sourceLine" id="cb181-32" data-line-number="32"><span class="co"># xlab="Total UMI count", ylab="-Log Probability")</span></a> -<a class="sourceLine" id="cb181-33" data-line-number="33"><span class="co"># </span></a> -<a class="sourceLine" id="cb181-34" 
data-line-number="34"><span class="co"># cells <- colnames(raw.counts)[is.cell]</span></a> -<a class="sourceLine" id="cb181-35" data-line-number="35"><span class="co"># </span></a> -<a class="sourceLine" id="cb181-36" data-line-number="36"><span class="co"># TPR <- sum(cells %in% truth[,1])/length(cells)</span></a> -<a class="sourceLine" id="cb181-37" data-line-number="37"><span class="co"># Recall <- sum(cells %in% truth[,1])/length(truth[,1])</span></a> -<a class="sourceLine" id="cb181-38" data-line-number="38"><span class="co"># c(TPR, Recall)</span></a></code></pre></div> +<div class="sourceCode" id="cb189"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb189-1" data-line-number="1"><span class="kw">library</span>(<span class="st">"Matrix"</span>)</a> +<a class="sourceLine" id="cb189-2" data-line-number="2">raw.counts <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/pancreas/muraro.rds"</span>)</a> +<a class="sourceLine" id="cb189-3" data-line-number="3"></a> +<a class="sourceLine" id="cb189-4" data-line-number="4"><span class="kw">library</span>(<span class="st">"DropletUtils"</span>)</a> +<a class="sourceLine" id="cb189-5" data-line-number="5"><span class="kw">example</span>(write10xCounts, <span class="dt">echo=</span><span class="ot">FALSE</span>) </a> +<a class="sourceLine" id="cb189-6" data-line-number="6">dir.name <-<span class="st"> </span>tmpdir</a> +<a class="sourceLine" id="cb189-7" data-line-number="7"><span class="kw">list.files</span>(dir.name)</a> +<a class="sourceLine" id="cb189-8" data-line-number="8">sce <-<span class="st"> </span><span class="kw">read10xCounts</span>(dir.name)</a> +<a class="sourceLine" id="cb189-9" data-line-number="9">sce</a> +<a class="sourceLine" id="cb189-10" data-line-number="10">my.counts <-<span class="st"> </span>DropletUtils<span class="op">:::</span><span class="kw">simCounts</span>()</a> +<a class="sourceLine" id="cb189-11" 
data-line-number="11">br.out <-<span class="st"> </span><span class="kw">barcodeRanks</span>(my.counts)</a> +<a class="sourceLine" id="cb189-12" data-line-number="12"></a> +<a class="sourceLine" id="cb189-13" data-line-number="13"><span class="co"># Making a plot.</span></a> +<a class="sourceLine" id="cb189-14" data-line-number="14"><span class="kw">plot</span>(br.out<span class="op">$</span>rank, br.out<span class="op">$</span>total, <span class="dt">log=</span><span class="st">"xy"</span>, <span class="dt">xlab=</span><span class="st">"Rank"</span>, <span class="dt">ylab=</span><span class="st">"Total"</span>)</a> +<a class="sourceLine" id="cb189-15" data-line-number="15">o <-<span class="st"> </span><span class="kw">order</span>(br.out<span class="op">$</span>rank)</a> +<a class="sourceLine" id="cb189-16" data-line-number="16"><span class="kw">lines</span>(br.out<span class="op">$</span>rank[o], br.out<span class="op">$</span>fitted[o], <span class="dt">col=</span><span class="st">"red"</span>)</a> +<a class="sourceLine" id="cb189-17" data-line-number="17"></a> +<a class="sourceLine" id="cb189-18" data-line-number="18"><span class="kw">abline</span>(<span class="dt">h=</span><span class="kw">metadata</span>(br.out)<span class="op">$</span>knee, <span class="dt">col=</span><span class="st">"dodgerblue"</span>, <span class="dt">lty=</span><span class="dv">2</span>)</a> +<a class="sourceLine" id="cb189-19" data-line-number="19"><span class="kw">abline</span>(<span class="dt">h=</span><span class="kw">metadata</span>(br.out)<span class="op">$</span>inflection, <span class="dt">col=</span><span class="st">"forestgreen"</span>, <span class="dt">lty=</span><span class="dv">2</span>)</a> +<a class="sourceLine" id="cb189-20" data-line-number="20"><span class="kw">legend</span>(<span class="st">"bottomleft"</span>, <span class="dt">lty=</span><span class="dv">2</span>, <span class="dt">col=</span><span class="kw">c</span>(<span class="st">"dodgerblue"</span>, <span 
class="st">"forestgreen"</span>), </a> +<a class="sourceLine" id="cb189-21" data-line-number="21"> <span class="dt">legend=</span><span class="kw">c</span>(<span class="st">"knee"</span>, <span class="st">"inflection"</span>))</a> +<a class="sourceLine" id="cb189-22" data-line-number="22"><span class="co"># emptyDrops</span></a> +<a class="sourceLine" id="cb189-23" data-line-number="23"><span class="kw">set.seed</span>(<span class="dv">100</span>)</a> +<a class="sourceLine" id="cb189-24" data-line-number="24">e.out <-<span class="st"> </span><span class="kw">emptyDrops</span>(my.counts)</a> +<a class="sourceLine" id="cb189-25" data-line-number="25">is.cell <-<span class="st"> </span>e.out<span class="op">$</span>FDR <span class="op"><=</span><span class="st"> </span><span class="fl">0.01</span></a> +<a class="sourceLine" id="cb189-26" data-line-number="26"><span class="kw">sum</span>(is.cell, <span class="dt">na.rm=</span><span class="ot">TRUE</span>)</a> +<a class="sourceLine" id="cb189-27" data-line-number="27"></a> +<a class="sourceLine" id="cb189-28" data-line-number="28"><span class="kw">plot</span>(e.out<span class="op">$</span>Total, <span class="op">-</span>e.out<span class="op">$</span>LogProb, <span class="dt">col=</span><span class="kw">ifelse</span>(is.cell, <span class="st">"red"</span>, <span class="st">"black"</span>),</a> +<a class="sourceLine" id="cb189-29" data-line-number="29"> <span class="dt">xlab=</span><span class="st">"Total UMI count"</span>, <span class="dt">ylab=</span><span class="st">"-Log Probability"</span>)</a> +<a class="sourceLine" id="cb189-30" data-line-number="30"></a> +<a class="sourceLine" id="cb189-31" data-line-number="31"><span class="co"># plot(e.out$Total, -e.out$LogProb, col=ifelse(is.cell, "red", "black"),</span></a> +<a class="sourceLine" id="cb189-32" data-line-number="32"><span class="co"># xlab="Total UMI count", ylab="-Log Probability")</span></a> +<a class="sourceLine" id="cb189-33" data-line-number="33"><span 
class="co"># </span></a> +<a class="sourceLine" id="cb189-34" data-line-number="34"><span class="co"># cells <- colnames(raw.counts)[is.cell]</span></a> +<a class="sourceLine" id="cb189-35" data-line-number="35"><span class="co"># </span></a> +<a class="sourceLine" id="cb189-36" data-line-number="36"><span class="co"># TPR <- sum(cells %in% truth[,1])/length(cells)</span></a> +<a class="sourceLine" id="cb189-37" data-line-number="37"><span class="co"># Recall <- sum(cells %in% truth[,1])/length(truth[,1])</span></a> +<a class="sourceLine" id="cb189-38" data-line-number="38"><span class="co"># c(TPR, Recall)</span></a></code></pre></div> </div> </div> diff --git a/public/pseudotime.md b/public/pseudotime.md index 6c8dddd4d654f311e74f3095ef75b0d90981d421..331f88edf27d1abb2737462ad9c7fb6e8466e944 100644 --- a/public/pseudotime.md +++ b/public/pseudotime.md @@ -322,6 +322,46 @@ _Note_ You can also supply a start and an end cluster to `slingshot`. _Comments_ Did you notice the ordering of clusters in the lineage prediced for `16cells` state? There is an outlier-like cell in the 16cell group, find the outlier and remove it, then re-run `Slingshot`. +### GAM general additive model for identifying temporally expressed genes + +After running slingshot, an interesting next step may be to find genes that change their expression over the course of development. We demonstrate one possible method for this type of analysis on the 100 most variable genes. We will regress each gene on the pseudotime variable we have generated, using a general additive model (GAM). This allows us to detect non-linear patterns in gene expression. 
+ + +```r +library(gam) +t <- deng_SCE$slingPseudotime_1 + +# for time, only look at the 100 most variable genes +Y <- log1p(assay(deng_SCE,"logcounts")) + +var100 <- names(sort(apply(Y,1,var),decreasing = TRUE))[1:100] +Y <- Y[var100,] + +# fit a GAM with a loess term for pseudotime +gam.pval <- apply(Y,1,function(z){ + d <- data.frame(z=z, t=t) + suppressWarnings({ + tmp <- gam(z ~ lo(t), data=d) + }) + p <- summary(tmp)[3][[1]][2,3] + p +}) + +## Plot the top 100 genes' expression + +topgenes <- names(sort(gam.pval, decreasing = FALSE))[1:100] + +heatdata <- assays(deng_SCE)$logcounts[topgenes, order(t, na.last = NA)] +heatclus <- deng_SCE$cell_type2[order(t, na.last = NA)] + +heatmap(heatdata, Colv = NA, + ColSideColors = my_color[heatclus],cexRow = 1,cexCol = 1) +``` + +<img src="pseudotime_files/figure-html/gam_tm_deg-1.png" width="90%" style="display: block; margin: auto;" /> + + +We will regress each gene on the pseudotime variable we have generated, using a generalized additive model (GAM). This allows us to detect non-linear patterns in gene expression. 
### Monocle @@ -1095,133 +1135,134 @@ plot_dimred( ## [3] lle_1.1 snowfall_1.84-6.1 ## [5] snow_0.4-3 MASS_7.3-51.1 ## [7] scatterplot3d_0.3-41 monocle3_0.2.0 -## [9] ouija_0.99.0 Rcpp_1.0.2 -## [11] SLICER_0.2.0 slingshot_1.2.0 -## [13] princurve_2.1.4 Polychrome_1.2.3 -## [15] corrplot_0.84 ggbeeswarm_0.6.0 -## [17] ggthemes_4.2.0 scater_1.12.2 -## [19] destiny_2.14.0 monocle_2.12.0 -## [21] DDRTree_0.1.5 irlba_2.3.3 -## [23] VGAM_1.1-1 ggplot2_3.2.1 -## [25] Matrix_1.2-17 M3Drop_1.10.0 -## [27] numDeriv_2016.8-1.1 TSCAN_1.22.0 -## [29] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 -## [31] DelayedArray_0.10.0 BiocParallel_1.18.1 -## [33] matrixStats_0.55.0 Biobase_2.44.0 -## [35] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 -## [37] IRanges_2.18.3 S4Vectors_0.22.1 -## [39] BiocGenerics_0.30.0 +## [9] gam_1.16.1 foreach_1.4.7 +## [11] ouija_0.99.0 Rcpp_1.0.2 +## [13] SLICER_0.2.0 slingshot_1.2.0 +## [15] princurve_2.1.4 Polychrome_1.2.3 +## [17] corrplot_0.84 ggbeeswarm_0.6.0 +## [19] ggthemes_4.2.0 scater_1.12.2 +## [21] destiny_2.14.0 monocle_2.12.0 +## [23] DDRTree_0.1.5 irlba_2.3.3 +## [25] VGAM_1.1-1 ggplot2_3.2.1 +## [27] Matrix_1.2-17 M3Drop_1.10.0 +## [29] numDeriv_2016.8-1.1 TSCAN_1.22.0 +## [31] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 +## [33] DelayedArray_0.10.0 BiocParallel_1.18.1 +## [35] matrixStats_0.55.0 Biobase_2.44.0 +## [37] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 +## [39] IRanges_2.18.3 S4Vectors_0.22.1 +## [41] BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] rgl_0.100.30 rsvd_1.0.2 ## [3] vcd_1.4-4 Hmisc_4.2-0 ## [5] zinbwave_1.6.0 corpcor_1.6.9 ## [7] ps_1.3.0 class_7.3-15 -## [9] foreach_1.4.7 lmtest_0.9-37 -## [11] glmnet_2.0-18 crayon_1.3.4 -## [13] laeken_0.5.0 nlme_3.1-139 -## [15] backports_1.1.4 qlcMatrix_0.9.7 -## [17] rlang_0.4.0 XVector_0.24.0 -## [19] readxl_1.3.1 callr_3.3.2 -## [21] limma_3.40.6 phylobase_0.8.6 -## [23] smoother_1.1 manipulateWidget_0.10.0 -## [25] bit64_0.9-7 
loo_2.1.0 -## [27] glue_1.3.1 pheatmap_1.0.12 -## [29] rngtools_1.4 splancs_2.01-40 -## [31] processx_3.4.1 vipor_0.4.5 -## [33] AnnotationDbi_1.46.1 haven_2.1.1 -## [35] tidyselect_0.2.5 rio_0.5.16 -## [37] XML_3.98-1.20 tidyr_1.0.0 -## [39] zoo_1.8-6 xtable_1.8-4 -## [41] magrittr_1.5 evaluate_0.14 -## [43] bibtex_0.4.2 cli_1.1.0 -## [45] zlibbioc_1.30.0 rstudioapi_0.10 -## [47] miniUI_0.1.1.1 sp_1.3-1 -## [49] rpart_4.1-15 locfdr_1.1-8 -## [51] RcppEigen_0.3.3.5.0 shiny_1.3.2 -## [53] BiocSingular_1.0.0 xfun_0.9 -## [55] leidenbase_0.1.0 inline_0.3.15 -## [57] pkgbuild_1.0.5 cluster_2.1.0 -## [59] caTools_1.17.1.2 sgeostat_1.0-27 -## [61] tibble_2.1.3 ggrepel_0.8.1 -## [63] ape_5.3 stabledist_0.7-1 -## [65] zeallot_0.1.0 withr_2.1.2 -## [67] bitops_1.0-6 slam_0.1-45 -## [69] ranger_0.11.2 plyr_1.8.4 -## [71] cellranger_1.1.0 pcaPP_1.9-73 -## [73] sparsesvd_0.2 coda_0.19-3 -## [75] e1071_1.7-2 RcppParallel_4.4.3 -## [77] pillar_1.4.2 gplots_3.0.1.1 -## [79] reldist_1.6-6 kernlab_0.9-27 -## [81] TTR_0.23-5 ellipsis_0.3.0 -## [83] tripack_1.3-8 DelayedMatrixStats_1.6.1 -## [85] xts_0.11-2 vctrs_0.2.0 -## [87] NMF_0.21.0 tools_3.6.0 -## [89] foreign_0.8-70 rncl_0.8.3 -## [91] beeswarm_0.2.3 munsell_0.5.0 -## [93] proxy_0.4-23 HSMMSingleCell_1.4.0 -## [95] compiler_3.6.0 abind_1.4-5 -## [97] httpuv_1.5.2 pkgmaker_0.27 -## [99] GenomeInfoDbData_1.2.1 gridExtra_2.3 -## [101] edgeR_3.26.8 lattice_0.20-38 -## [103] deldir_0.1-23 utf8_1.1.4 -## [105] later_0.8.0 dplyr_0.8.3 -## [107] jsonlite_1.6 scales_1.0.0 -## [109] docopt_0.6.1 carData_3.0-2 -## [111] genefilter_1.66.0 lazyeval_0.2.2 -## [113] promises_1.0.1 spatstat_1.61-0 -## [115] car_3.0-3 doParallel_1.0.15 -## [117] latticeExtra_0.6-28 R.utils_2.9.0 -## [119] goftest_1.1-1 spatstat.utils_1.13-0 -## [121] checkmate_1.9.4 cowplot_1.0.0 -## [123] rmarkdown_1.15 openxlsx_4.1.0.1 -## [125] statmod_1.4.32 webshot_0.5.1 -## [127] Rtsne_0.15 forcats_0.4.0 -## [129] copula_0.999-19.1 softImpute_1.4 -## [131] uwot_0.1.4 
igraph_1.2.4.1 -## [133] HDF5Array_1.12.2 survival_2.43-3 -## [135] yaml_2.2.0 htmltools_0.3.6 -## [137] memoise_1.1.0 locfit_1.5-9.1 -## [139] viridisLite_0.3.0 digest_0.6.21 -## [141] assertthat_0.2.1 mime_0.7 -## [143] densityClust_0.3 registry_0.5-1 -## [145] RSQLite_2.1.2 data.table_1.12.2 -## [147] blob_1.2.0 R.oo_1.22.0 -## [149] RNeXML_2.3.0 labeling_0.3 -## [151] fastICA_1.2-2 Formula_1.2-3 -## [153] Rhdf5lib_1.6.1 RCurl_1.95-4.12 -## [155] hms_0.5.1 rhdf5_2.28.0 -## [157] colorspace_1.4-1 base64enc_0.1-3 -## [159] nnet_7.3-12 ADGofTest_0.3 -## [161] mclust_5.4.5 bookdown_0.13 -## [163] RANN_2.6.1 mvtnorm_1.0-11 -## [165] fansi_0.4.0 pspline_1.0-18 -## [167] VIM_4.8.0 R6_2.4.0 -## [169] grid_3.6.0 lifecycle_0.1.0 -## [171] acepack_1.4.1 zip_2.0.4 -## [173] curl_4.2 gdata_2.18.0 -## [175] robustbase_0.93-5 howmany_0.3-1 -## [177] RcppAnnoy_0.0.13 RColorBrewer_1.1-2 -## [179] MCMCglmm_2.29 iterators_1.0.12 -## [181] alphahull_2.2 stringr_1.4.0 -## [183] htmlwidgets_1.3 polyclip_1.10-0 -## [185] purrr_0.3.2 crosstalk_1.0.0 -## [187] mgcv_1.8-28 tensorA_0.36.1 -## [189] htmlTable_1.13.2 clusterExperiment_2.4.4 -## [191] codetools_0.2-16 FNN_1.1.3 -## [193] gtools_3.8.1 prettyunits_1.0.2 -## [195] gridBase_0.4-7 RSpectra_0.15-0 -## [197] R.methodsS3_1.7.1 gtable_0.3.0 -## [199] DBI_1.0.0 highr_0.8 -## [201] tensor_1.5 httr_1.4.1 -## [203] KernSmooth_2.23-15 stringi_1.4.3 -## [205] progress_1.2.2 reshape2_1.4.3 -## [207] uuid_0.1-2 cubature_2.0.3 -## [209] annotate_1.62.0 viridis_0.5.1 -## [211] xml2_1.2.2 combinat_0.0-8 -## [213] bbmle_1.0.20 boot_1.3-20 -## [215] BiocNeighbors_1.2.0 ade4_1.7-13 -## [217] DEoptimR_1.0-8 bit_1.1-14 -## [219] spatstat.data_1.4-0 pkgconfig_2.0.3 -## [221] gsl_2.1-6 knitr_1.25 +## [9] lmtest_0.9-37 glmnet_2.0-18 +## [11] crayon_1.3.4 laeken_0.5.0 +## [13] nlme_3.1-139 backports_1.1.4 +## [15] qlcMatrix_0.9.7 rlang_0.4.0 +## [17] XVector_0.24.0 readxl_1.3.1 +## [19] callr_3.3.2 limma_3.40.6 +## [21] phylobase_0.8.6 smoother_1.1 +## 
[23] manipulateWidget_0.10.0 bit64_0.9-7 +## [25] loo_2.1.0 glue_1.3.1 +## [27] pheatmap_1.0.12 rngtools_1.4 +## [29] splancs_2.01-40 processx_3.4.1 +## [31] vipor_0.4.5 AnnotationDbi_1.46.1 +## [33] haven_2.1.1 tidyselect_0.2.5 +## [35] rio_0.5.16 XML_3.98-1.20 +## [37] tidyr_1.0.0 zoo_1.8-6 +## [39] xtable_1.8-4 magrittr_1.5 +## [41] evaluate_0.14 bibtex_0.4.2 +## [43] cli_1.1.0 zlibbioc_1.30.0 +## [45] rstudioapi_0.10 miniUI_0.1.1.1 +## [47] sp_1.3-1 rpart_4.1-15 +## [49] locfdr_1.1-8 RcppEigen_0.3.3.5.0 +## [51] shiny_1.3.2 BiocSingular_1.0.0 +## [53] xfun_0.9 leidenbase_0.1.0 +## [55] inline_0.3.15 pkgbuild_1.0.5 +## [57] cluster_2.1.0 caTools_1.17.1.2 +## [59] sgeostat_1.0-27 tibble_2.1.3 +## [61] ggrepel_0.8.1 ape_5.3 +## [63] stabledist_0.7-1 zeallot_0.1.0 +## [65] withr_2.1.2 bitops_1.0-6 +## [67] slam_0.1-45 ranger_0.11.2 +## [69] plyr_1.8.4 cellranger_1.1.0 +## [71] pcaPP_1.9-73 sparsesvd_0.2 +## [73] coda_0.19-3 e1071_1.7-2 +## [75] RcppParallel_4.4.3 pillar_1.4.2 +## [77] gplots_3.0.1.1 reldist_1.6-6 +## [79] kernlab_0.9-27 TTR_0.23-5 +## [81] ellipsis_0.3.0 tripack_1.3-8 +## [83] DelayedMatrixStats_1.6.1 xts_0.11-2 +## [85] vctrs_0.2.0 NMF_0.21.0 +## [87] tools_3.6.0 foreign_0.8-70 +## [89] rncl_0.8.3 beeswarm_0.2.3 +## [91] munsell_0.5.0 proxy_0.4-23 +## [93] HSMMSingleCell_1.4.0 compiler_3.6.0 +## [95] abind_1.4-5 httpuv_1.5.2 +## [97] pkgmaker_0.27 GenomeInfoDbData_1.2.1 +## [99] gridExtra_2.3 edgeR_3.26.8 +## [101] lattice_0.20-38 deldir_0.1-23 +## [103] utf8_1.1.4 later_0.8.0 +## [105] dplyr_0.8.3 jsonlite_1.6 +## [107] scales_1.0.0 docopt_0.6.1 +## [109] carData_3.0-2 genefilter_1.66.0 +## [111] lazyeval_0.2.2 promises_1.0.1 +## [113] spatstat_1.61-0 car_3.0-3 +## [115] doParallel_1.0.15 latticeExtra_0.6-28 +## [117] R.utils_2.9.0 goftest_1.1-1 +## [119] spatstat.utils_1.13-0 checkmate_1.9.4 +## [121] cowplot_1.0.0 rmarkdown_1.15 +## [123] openxlsx_4.1.0.1 statmod_1.4.32 +## [125] webshot_0.5.1 Rtsne_0.15 +## [127] forcats_0.4.0 
copula_0.999-19.1 +## [129] softImpute_1.4 uwot_0.1.4 +## [131] igraph_1.2.4.1 HDF5Array_1.12.2 +## [133] survival_2.43-3 yaml_2.2.0 +## [135] htmltools_0.3.6 memoise_1.1.0 +## [137] locfit_1.5-9.1 viridisLite_0.3.0 +## [139] digest_0.6.21 assertthat_0.2.1 +## [141] mime_0.7 densityClust_0.3 +## [143] registry_0.5-1 RSQLite_2.1.2 +## [145] data.table_1.12.2 blob_1.2.0 +## [147] R.oo_1.22.0 RNeXML_2.3.0 +## [149] labeling_0.3 fastICA_1.2-2 +## [151] Formula_1.2-3 Rhdf5lib_1.6.1 +## [153] RCurl_1.95-4.12 hms_0.5.1 +## [155] rhdf5_2.28.0 colorspace_1.4-1 +## [157] base64enc_0.1-3 nnet_7.3-12 +## [159] ADGofTest_0.3 mclust_5.4.5 +## [161] bookdown_0.13 RANN_2.6.1 +## [163] mvtnorm_1.0-11 fansi_0.4.0 +## [165] pspline_1.0-18 VIM_4.8.0 +## [167] R6_2.4.0 grid_3.6.0 +## [169] lifecycle_0.1.0 acepack_1.4.1 +## [171] zip_2.0.4 curl_4.2 +## [173] gdata_2.18.0 robustbase_0.93-5 +## [175] howmany_0.3-1 RcppAnnoy_0.0.13 +## [177] RColorBrewer_1.1-2 MCMCglmm_2.29 +## [179] iterators_1.0.12 alphahull_2.2 +## [181] stringr_1.4.0 htmlwidgets_1.3 +## [183] polyclip_1.10-0 purrr_0.3.2 +## [185] crosstalk_1.0.0 mgcv_1.8-28 +## [187] tensorA_0.36.1 htmlTable_1.13.2 +## [189] clusterExperiment_2.4.4 codetools_0.2-16 +## [191] FNN_1.1.3 gtools_3.8.1 +## [193] prettyunits_1.0.2 gridBase_0.4-7 +## [195] RSpectra_0.15-0 R.methodsS3_1.7.1 +## [197] gtable_0.3.0 DBI_1.0.0 +## [199] highr_0.8 tensor_1.5 +## [201] httr_1.4.1 KernSmooth_2.23-15 +## [203] stringi_1.4.3 progress_1.2.2 +## [205] reshape2_1.4.3 uuid_0.1-2 +## [207] cubature_2.0.3 annotate_1.62.0 +## [209] viridis_0.5.1 xml2_1.2.2 +## [211] combinat_0.0-8 bbmle_1.0.20 +## [213] boot_1.3-20 BiocNeighbors_1.2.0 +## [215] ade4_1.7-13 DEoptimR_1.0-8 +## [217] bit_1.1-14 spatstat.data_1.4-0 +## [219] pkgconfig_2.0.3 gsl_2.1-6 +## [221] knitr_1.25 ``` diff --git a/public/pseudotime_files/figure-html/gam_tm_deg-1.png b/public/pseudotime_files/figure-html/gam_tm_deg-1.png new file mode 100644 index 
0000000000000000000000000000000000000000..47ddec458b61b66586458e9137a004b36c4ef33d Binary files /dev/null and b/public/pseudotime_files/figure-html/gam_tm_deg-1.png differ diff --git a/public/quality-control-and-data-visualisation.html b/public/quality-control-and-data-visualisation.html index e22f6820b66d386a707d277efea30854c841a83e..905d37d3a4382faca79fbe02d371e051b5704fa8 100644 --- a/public/quality-control-and-data-visualisation.html +++ b/public/quality-control-and-data-visualisation.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa 
fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a 
href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" 
data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa 
fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -541,14 +541,14 @@ facilitate the quantification both unique molecular identifiers (UMIs) and ERCC <em>spike-ins</em> were used. The data files are located in the <code>tung</code> folder in your working directory. These files are the copies of the original files made on the 15/03/16. 
We will use these copies for reproducibility purposes.</p> -<div class="sourceCode" id="cb182"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb182-1" data-line-number="1"><span class="kw">library</span>(SingleCellExperiment)</a> -<a class="sourceLine" id="cb182-2" data-line-number="2"><span class="kw">library</span>(scater)</a> -<a class="sourceLine" id="cb182-3" data-line-number="3"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb190"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb190-1" data-line-number="1"><span class="kw">library</span>(SingleCellExperiment)</a> +<a class="sourceLine" id="cb190-2" data-line-number="2"><span class="kw">library</span>(scater)</a> +<a class="sourceLine" id="cb190-3" data-line-number="3"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a></code></pre></div> <p>Load the data and annotations:</p> -<div class="sourceCode" id="cb183"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb183-1" data-line-number="1">molecules <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/molecules.txt"</span>, <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>)</a> -<a class="sourceLine" id="cb183-2" data-line-number="2">anno <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/annotation.txt"</span>, <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>, <span class="dt">header =</span> <span class="ot">TRUE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb191"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb191-1" data-line-number="1">molecules <-<span 
class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/molecules.txt"</span>, <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>)</a> +<a class="sourceLine" id="cb191-2" data-line-number="2">anno <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/annotation.txt"</span>, <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>, <span class="dt">header =</span> <span class="ot">TRUE</span>)</a></code></pre></div> <p>Inspect a small portion of the expression matrix</p> -<div class="sourceCode" id="cb184"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb184-1" data-line-number="1"><span class="kw">head</span>(molecules[ , <span class="dv">1</span><span class="op">:</span><span class="dv">3</span>])</a></code></pre></div> +<div class="sourceCode" id="cb192"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb192-1" data-line-number="1"><span class="kw">head</span>(molecules[ , <span class="dv">1</span><span class="op">:</span><span class="dv">3</span>])</a></code></pre></div> <pre><code>## NA19098.r1.A01 NA19098.r1.A02 NA19098.r1.A03 ## ENSG00000237683 0 0 0 ## ENSG00000187634 0 0 0 @@ -556,7 +556,7 @@ working directory. 
These files are the copies of the original files made on the ## ENSG00000187961 0 0 0 ## ENSG00000187583 0 0 0 ## ENSG00000187642 0 0 0</code></pre> -<div class="sourceCode" id="cb186"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb186-1" data-line-number="1"><span class="kw">head</span>(anno)</a></code></pre></div> +<div class="sourceCode" id="cb194"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb194-1" data-line-number="1"><span class="kw">head</span>(anno)</a></code></pre></div> <pre><code>## individual replicate well batch sample_id ## 1 NA19098 r1 A01 NA19098.r1 NA19098.r1.A01 ## 2 NA19098 r1 A02 NA19098.r1 NA19098.r1.A02 @@ -567,31 +567,31 @@ working directory. These files are the copies of the original files made on the <p>The data consists of 3 individuals and <code>r length(unique(anno$replicate))</code> replicates and therefore has <code>r length(unique(anno$batch))</code> batches in total.</p> <p>We standardize the analysis by using both <code>SingleCellExperiment</code> (SCE) and <code>scater</code> packages. 
First, create the SCE object:</p> -<div class="sourceCode" id="cb188"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb188-1" data-line-number="1">umi <-<span class="st"> </span><span class="kw">SingleCellExperiment</span>(</a> -<a class="sourceLine" id="cb188-2" data-line-number="2"> <span class="dt">assays =</span> <span class="kw">list</span>(<span class="dt">counts =</span> <span class="kw">as.matrix</span>(molecules)), </a> -<a class="sourceLine" id="cb188-3" data-line-number="3"> <span class="dt">colData =</span> anno</a> -<a class="sourceLine" id="cb188-4" data-line-number="4">)</a></code></pre></div> +<div class="sourceCode" id="cb196"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb196-1" data-line-number="1">umi <-<span class="st"> </span><span class="kw">SingleCellExperiment</span>(</a> +<a class="sourceLine" id="cb196-2" data-line-number="2"> <span class="dt">assays =</span> <span class="kw">list</span>(<span class="dt">counts =</span> <span class="kw">as.matrix</span>(molecules)), </a> +<a class="sourceLine" id="cb196-3" data-line-number="3"> <span class="dt">colData =</span> anno</a> +<a class="sourceLine" id="cb196-4" data-line-number="4">)</a></code></pre></div> <p>Remove genes that are not expressed in any cell:</p> -<div class="sourceCode" id="cb189"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb189-1" data-line-number="1">keep_feature <-<span class="st"> </span><span class="kw">rowSums</span>(<span class="kw">counts</span>(umi) <span class="op">></span><span class="st"> </span><span class="dv">0</span>) <span class="op">></span><span class="st"> </span><span class="dv">0</span></a> -<a class="sourceLine" id="cb189-2" data-line-number="2">umi <-<span class="st"> </span>umi[keep_feature, ]</a></code></pre></div> +<div class="sourceCode" id="cb197"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb197-1" 
data-line-number="1">keep_feature <-<span class="st"> </span><span class="kw">rowSums</span>(<span class="kw">counts</span>(umi) <span class="op">></span><span class="st"> </span><span class="dv">0</span>) <span class="op">></span><span class="st"> </span><span class="dv">0</span></a> +<a class="sourceLine" id="cb197-2" data-line-number="2">umi <-<span class="st"> </span>umi[keep_feature, ]</a></code></pre></div> <p>Define control features (genes) - ERCC spike-ins and mitochondrial genes (<a href="http://jdblischak.github.io/singleCellSeq/analysis/qc-filter-ipsc.html">provided</a> by the authors):</p> -<div class="sourceCode" id="cb190"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb190-1" data-line-number="1"><span class="kw">isSpike</span>(umi, <span class="st">"ERCC"</span>) <-<span class="st"> </span><span class="kw">grepl</span>(<span class="st">"^ERCC-"</span>, <span class="kw">rownames</span>(umi))</a> -<a class="sourceLine" id="cb190-2" data-line-number="2"><span class="kw">isSpike</span>(umi, <span class="st">"MT"</span>) <-<span class="st"> </span><span class="kw">rownames</span>(umi) <span class="op">%in%</span><span class="st"> </span></a> -<a class="sourceLine" id="cb190-3" data-line-number="3"><span class="st"> </span><span class="kw">c</span>(<span class="st">"ENSG00000198899"</span>, <span class="st">"ENSG00000198727"</span>, <span class="st">"ENSG00000198888"</span>,</a> -<a class="sourceLine" id="cb190-4" data-line-number="4"> <span class="st">"ENSG00000198886"</span>, <span class="st">"ENSG00000212907"</span>, <span class="st">"ENSG00000198786"</span>,</a> -<a class="sourceLine" id="cb190-5" data-line-number="5"> <span class="st">"ENSG00000198695"</span>, <span class="st">"ENSG00000198712"</span>, <span class="st">"ENSG00000198804"</span>,</a> -<a class="sourceLine" id="cb190-6" data-line-number="6"> <span class="st">"ENSG00000198763"</span>, <span class="st">"ENSG00000228253"</span>, <span 
class="st">"ENSG00000198938"</span>,</a> -<a class="sourceLine" id="cb190-7" data-line-number="7"> <span class="st">"ENSG00000198840"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb198"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb198-1" data-line-number="1"><span class="kw">isSpike</span>(umi, <span class="st">"ERCC"</span>) <-<span class="st"> </span><span class="kw">grepl</span>(<span class="st">"^ERCC-"</span>, <span class="kw">rownames</span>(umi))</a> +<a class="sourceLine" id="cb198-2" data-line-number="2"><span class="kw">isSpike</span>(umi, <span class="st">"MT"</span>) <-<span class="st"> </span><span class="kw">rownames</span>(umi) <span class="op">%in%</span><span class="st"> </span></a> +<a class="sourceLine" id="cb198-3" data-line-number="3"><span class="st"> </span><span class="kw">c</span>(<span class="st">"ENSG00000198899"</span>, <span class="st">"ENSG00000198727"</span>, <span class="st">"ENSG00000198888"</span>,</a> +<a class="sourceLine" id="cb198-4" data-line-number="4"> <span class="st">"ENSG00000198886"</span>, <span class="st">"ENSG00000212907"</span>, <span class="st">"ENSG00000198786"</span>,</a> +<a class="sourceLine" id="cb198-5" data-line-number="5"> <span class="st">"ENSG00000198695"</span>, <span class="st">"ENSG00000198712"</span>, <span class="st">"ENSG00000198804"</span>,</a> +<a class="sourceLine" id="cb198-6" data-line-number="6"> <span class="st">"ENSG00000198763"</span>, <span class="st">"ENSG00000228253"</span>, <span class="st">"ENSG00000198938"</span>,</a> +<a class="sourceLine" id="cb198-7" data-line-number="7"> <span class="st">"ENSG00000198840"</span>)</a></code></pre></div> <p>Calculate the quality metrics:</p> -<div class="sourceCode" id="cb191"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb191-1" data-line-number="1">umi <-<span class="st"> </span><span class="kw">calculateQCMetrics</span>(</a> -<a class="sourceLine" id="cb191-2" 
data-line-number="2"> umi,</a> -<a class="sourceLine" id="cb191-3" data-line-number="3"> <span class="dt">feature_controls =</span> <span class="kw">list</span>(</a> -<a class="sourceLine" id="cb191-4" data-line-number="4"> <span class="dt">ERCC =</span> <span class="kw">isSpike</span>(umi, <span class="st">"ERCC"</span>), </a> -<a class="sourceLine" id="cb191-5" data-line-number="5"> <span class="dt">MT =</span> <span class="kw">isSpike</span>(umi, <span class="st">"MT"</span>)</a> -<a class="sourceLine" id="cb191-6" data-line-number="6"> )</a> -<a class="sourceLine" id="cb191-7" data-line-number="7">)</a></code></pre></div> +<div class="sourceCode" id="cb199"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb199-1" data-line-number="1">umi <-<span class="st"> </span><span class="kw">calculateQCMetrics</span>(</a> +<a class="sourceLine" id="cb199-2" data-line-number="2"> umi,</a> +<a class="sourceLine" id="cb199-3" data-line-number="3"> <span class="dt">feature_controls =</span> <span class="kw">list</span>(</a> +<a class="sourceLine" id="cb199-4" data-line-number="4"> <span class="dt">ERCC =</span> <span class="kw">isSpike</span>(umi, <span class="st">"ERCC"</span>), </a> +<a class="sourceLine" id="cb199-5" data-line-number="5"> <span class="dt">MT =</span> <span class="kw">isSpike</span>(umi, <span class="st">"MT"</span>)</a> +<a class="sourceLine" id="cb199-6" data-line-number="6"> )</a> +<a class="sourceLine" id="cb199-7" data-line-number="7">)</a></code></pre></div> <pre><code>## Warning in calculateQCMetrics(umi, feature_controls = list(ERCC = ## isSpike(umi, : spike-in set 'ERCC' overwritten by feature_controls set of ## the same name</code></pre> @@ -605,11 +605,11 @@ by the authors):</p> were using read counts rather than UMI counts this would be the total number of reads). 
Wells with few reads/molecules are likely to have been broken or failed to capture a cell, and should thus be removed.</p> -<div class="sourceCode" id="cb193"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb193-1" data-line-number="1"><span class="kw">hist</span>(</a> -<a class="sourceLine" id="cb193-2" data-line-number="2"> umi<span class="op">$</span>total_counts,</a> -<a class="sourceLine" id="cb193-3" data-line-number="3"> <span class="dt">breaks =</span> <span class="dv">100</span></a> -<a class="sourceLine" id="cb193-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb193-5" data-line-number="5"><span class="kw">abline</span>(<span class="dt">v =</span> <span class="dv">25000</span>, <span class="dt">col =</span> <span class="st">"red"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb201"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb201-1" data-line-number="1"><span class="kw">hist</span>(</a> +<a class="sourceLine" id="cb201-2" data-line-number="2"> umi<span class="op">$</span>total_counts,</a> +<a class="sourceLine" id="cb201-3" data-line-number="3"> <span class="dt">breaks =</span> <span class="dv">100</span></a> +<a class="sourceLine" id="cb201-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb201-5" data-line-number="5"><span class="kw">abline</span>(<span class="dt">v =</span> <span class="dv">25000</span>, <span class="dt">col =</span> <span class="st">"red"</span>)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:total-counts-hist"></span> <img src="exprs-qc_files/figure-html/total-counts-hist-1.png" alt="Histogram of library sizes for all cells" width="90%" /> <p class="caption"> @@ -630,11 +630,11 @@ total number of molecules for each cell should follow?</p></li> <div id="detected-genes" class="section level3"> <h3><span class="header-section-number">6.2.2</span> Detected genes</h3> <p>In addition to ensuring 
sufficient sequencing depth for each sample, we also want to make sure that the reads are distributed across the transcriptome. Thus, we count the total number of unique genes detected in each sample.</p> -<div class="sourceCode" id="cb195"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb195-1" data-line-number="1"><span class="kw">hist</span>(</a> -<a class="sourceLine" id="cb195-2" data-line-number="2"> umi<span class="op">$</span>total_features_by_counts,</a> -<a class="sourceLine" id="cb195-3" data-line-number="3"> <span class="dt">breaks =</span> <span class="dv">100</span></a> -<a class="sourceLine" id="cb195-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb195-5" data-line-number="5"><span class="kw">abline</span>(<span class="dt">v =</span> <span class="dv">7000</span>, <span class="dt">col =</span> <span class="st">"red"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb203"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb203-1" data-line-number="1"><span class="kw">hist</span>(</a> +<a class="sourceLine" id="cb203-2" data-line-number="2"> umi<span class="op">$</span>total_features_by_counts,</a> +<a class="sourceLine" id="cb203-3" data-line-number="3"> <span class="dt">breaks =</span> <span class="dv">100</span></a> +<a class="sourceLine" id="cb203-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb203-5" data-line-number="5"><span class="kw">abline</span>(<span class="dt">v =</span> <span class="dv">7000</span>, <span class="dt">col =</span> <span class="st">"red"</span>)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:total-features-hist"></span> <img src="exprs-qc_files/figure-html/total-features-hist-1.png" alt="Histogram of the number of detected genes in all cells" width="90%" /> <p class="caption"> @@ -662,24 +662,24 @@ RNAs and endogenous RNAs. 
This ratio can be used to estimate the total amount of RNA in the captured cells. Cells with a high level of <em>spike-in</em> RNAs had low starting amounts of RNA, likely due to the cell being dead or stressed which may result in the RNA being degraded.</p> -<div class="sourceCode" id="cb197"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb197-1" data-line-number="1"><span class="kw">plotColData</span>(</a> -<a class="sourceLine" id="cb197-2" data-line-number="2"> umi,</a> -<a class="sourceLine" id="cb197-3" data-line-number="3"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb197-4" data-line-number="4"> <span class="dt">y =</span> <span class="st">"pct_counts_MT"</span>,</a> -<a class="sourceLine" id="cb197-5" data-line-number="5"> <span class="dt">colour =</span> <span class="st">"batch"</span></a> -<a class="sourceLine" id="cb197-6" data-line-number="6">)</a></code></pre></div> +<div class="sourceCode" id="cb205"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb205-1" data-line-number="1"><span class="kw">plotColData</span>(</a> +<a class="sourceLine" id="cb205-2" data-line-number="2"> umi,</a> +<a class="sourceLine" id="cb205-3" data-line-number="3"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb205-4" data-line-number="4"> <span class="dt">y =</span> <span class="st">"pct_counts_MT"</span>,</a> +<a class="sourceLine" id="cb205-5" data-line-number="5"> <span class="dt">colour =</span> <span class="st">"batch"</span></a> +<a class="sourceLine" id="cb205-6" data-line-number="6">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:mt-vs-counts"></span> <img src="exprs-qc_files/figure-html/mt-vs-counts-1.png" alt="Percentage of counts in MT genes" width="90%" /> <p class="caption"> Figure 6.3: Percentage of counts in MT genes </p> </div> 
-<div class="sourceCode" id="cb198"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb198-1" data-line-number="1"><span class="kw">plotColData</span>(</a> -<a class="sourceLine" id="cb198-2" data-line-number="2"> umi,</a> -<a class="sourceLine" id="cb198-3" data-line-number="3"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb198-4" data-line-number="4"> <span class="dt">y =</span> <span class="st">"pct_counts_ERCC"</span>,</a> -<a class="sourceLine" id="cb198-5" data-line-number="5"> <span class="dt">colour =</span> <span class="st">"batch"</span></a> -<a class="sourceLine" id="cb198-6" data-line-number="6">)</a></code></pre></div> +<div class="sourceCode" id="cb206"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb206-1" data-line-number="1"><span class="kw">plotColData</span>(</a> +<a class="sourceLine" id="cb206-2" data-line-number="2"> umi,</a> +<a class="sourceLine" id="cb206-3" data-line-number="3"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb206-4" data-line-number="4"> <span class="dt">y =</span> <span class="st">"pct_counts_ERCC"</span>,</a> +<a class="sourceLine" id="cb206-5" data-line-number="5"> <span class="dt">colour =</span> <span class="st">"batch"</span></a> +<a class="sourceLine" id="cb206-6" data-line-number="6">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:ercc-vs-counts"></span> <img src="exprs-qc_files/figure-html/ercc-vs-counts-1.png" alt="Percentage of counts in ERCCs" width="90%" /> <p class="caption"> @@ -706,17 +706,17 @@ Figure 6.4: Percentage of counts in ERCCs <div id="manual" class="section level4"> <h4><span class="header-section-number">6.2.4.1</span> Manual</h4> <p>Now we can define a cell filter based on our previous analysis:</p> -<div class="sourceCode" id="cb201"><pre class="sourceCode r"><code 
class="sourceCode r"><a class="sourceLine" id="cb201-1" data-line-number="1">umi<span class="op">$</span>use <-<span class="st"> </span>(</a> -<a class="sourceLine" id="cb201-2" data-line-number="2"> <span class="co"># sufficient features (genes)</span></a> -<a class="sourceLine" id="cb201-3" data-line-number="3"> filter_by_expr_features <span class="op">&</span></a> -<a class="sourceLine" id="cb201-4" data-line-number="4"><span class="st"> </span><span class="co"># sufficient molecules counted</span></a> -<a class="sourceLine" id="cb201-5" data-line-number="5"><span class="st"> </span>filter_by_total_counts <span class="op">&</span></a> -<a class="sourceLine" id="cb201-6" data-line-number="6"><span class="st"> </span><span class="co"># sufficient endogenous RNA</span></a> -<a class="sourceLine" id="cb201-7" data-line-number="7"><span class="st"> </span>filter_by_ERCC <span class="op">&</span></a> -<a class="sourceLine" id="cb201-8" data-line-number="8"><span class="st"> </span><span class="co"># remove cells with unusual number of reads in MT genes</span></a> -<a class="sourceLine" id="cb201-9" data-line-number="9"><span class="st"> </span>filter_by_MT</a> -<a class="sourceLine" id="cb201-10" data-line-number="10">)</a></code></pre></div> -<div class="sourceCode" id="cb202"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb202-1" data-line-number="1"><span class="kw">table</span>(umi<span class="op">$</span>use)</a></code></pre></div> +<div class="sourceCode" id="cb209"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb209-1" data-line-number="1">umi<span class="op">$</span>use <-<span class="st"> </span>(</a> +<a class="sourceLine" id="cb209-2" data-line-number="2"> <span class="co"># sufficient features (genes)</span></a> +<a class="sourceLine" id="cb209-3" data-line-number="3"> filter_by_expr_features <span class="op">&</span></a> +<a class="sourceLine" id="cb209-4" data-line-number="4"><span 
class="st"> </span><span class="co"># sufficient molecules counted</span></a> +<a class="sourceLine" id="cb209-5" data-line-number="5"><span class="st"> </span>filter_by_total_counts <span class="op">&</span></a> +<a class="sourceLine" id="cb209-6" data-line-number="6"><span class="st"> </span><span class="co"># sufficient endogenous RNA</span></a> +<a class="sourceLine" id="cb209-7" data-line-number="7"><span class="st"> </span>filter_by_ERCC <span class="op">&</span></a> +<a class="sourceLine" id="cb209-8" data-line-number="8"><span class="st"> </span><span class="co"># remove cells with unusual number of reads in MT genes</span></a> +<a class="sourceLine" id="cb209-9" data-line-number="9"><span class="st"> </span>filter_by_MT</a> +<a class="sourceLine" id="cb209-10" data-line-number="10">)</a></code></pre></div> +<div class="sourceCode" id="cb210"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb210-1" data-line-number="1"><span class="kw">table</span>(umi<span class="op">$</span>use)</a></code></pre></div> <pre><code>## ## FALSE TRUE ## 207 657</code></pre> @@ -741,31 +741,31 @@ by using the <code>mvoutlier</code> package on the QC metrics for all cells. Thi identify cells that have substantially different QC metrics from the others, possibly corresponding to low-quality cells. 
We can visualize any outliers using a principal components plot as shown below:</p> -<div class="sourceCode" id="cb204"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb204-1" data-line-number="1">umi <-<span class="st"> </span><span class="kw">runPCA</span>(</a> -<a class="sourceLine" id="cb204-2" data-line-number="2"> umi, </a> -<a class="sourceLine" id="cb204-3" data-line-number="3"> <span class="dt">use_coldata =</span> <span class="ot">TRUE</span>, </a> -<a class="sourceLine" id="cb204-4" data-line-number="4"> <span class="dt">detect_outliers =</span> <span class="ot">TRUE</span></a> -<a class="sourceLine" id="cb204-5" data-line-number="5">)</a> -<a class="sourceLine" id="cb204-6" data-line-number="6"><span class="kw">reducedDimNames</span>(umi)</a></code></pre></div> +<div class="sourceCode" id="cb212"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb212-1" data-line-number="1">umi <-<span class="st"> </span><span class="kw">runPCA</span>(</a> +<a class="sourceLine" id="cb212-2" data-line-number="2"> umi, </a> +<a class="sourceLine" id="cb212-3" data-line-number="3"> <span class="dt">use_coldata =</span> <span class="ot">TRUE</span>, </a> +<a class="sourceLine" id="cb212-4" data-line-number="4"> <span class="dt">detect_outliers =</span> <span class="ot">TRUE</span></a> +<a class="sourceLine" id="cb212-5" data-line-number="5">)</a> +<a class="sourceLine" id="cb212-6" data-line-number="6"><span class="kw">reducedDimNames</span>(umi)</a></code></pre></div> <pre><code>## [1] "PCA_coldata"</code></pre> <p>Column subsetting can then be performed based on the <code>$outlier</code> slot, which indicates whether or not each cell has been designated as an outlier. 
Automatic outlier detection can be informative, but a close inspection of QC metrics and tailored filtering for the specifics of the dataset at hand is strongly recommended.</p> -<div class="sourceCode" id="cb206"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb206-1" data-line-number="1"><span class="kw">table</span>(umi<span class="op">$</span>outlier)</a></code></pre></div> +<div class="sourceCode" id="cb214"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb214-1" data-line-number="1"><span class="kw">table</span>(umi<span class="op">$</span>outlier)</a></code></pre></div> <pre><code>## ## FALSE TRUE ## 791 73</code></pre> <p>Then, we can use a PCA plot to see a 2D representation of the cells ordered by their quality metrics.</p> -<div class="sourceCode" id="cb208"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb208-1" data-line-number="1"><span class="kw">plotReducedDim</span>(</a> -<a class="sourceLine" id="cb208-2" data-line-number="2"> umi,</a> -<a class="sourceLine" id="cb208-3" data-line-number="3"> <span class="dt">use_dimred =</span> <span class="st">"PCA_coldata"</span>,</a> -<a class="sourceLine" id="cb208-4" data-line-number="4"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>, </a> -<a class="sourceLine" id="cb208-5" data-line-number="5"> <span class="dt">shape_by =</span> <span class="st">"use"</span>, </a> -<a class="sourceLine" id="cb208-6" data-line-number="6"> <span class="dt">colour_by =</span> <span class="st">"outlier"</span></a> -<a class="sourceLine" id="cb208-7" data-line-number="7">)</a></code></pre></div> +<div class="sourceCode" id="cb216"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb216-1" data-line-number="1"><span class="kw">plotReducedDim</span>(</a> +<a class="sourceLine" id="cb216-2" data-line-number="2"> umi,</a> +<a class="sourceLine" id="cb216-3" 
data-line-number="3"> <span class="dt">use_dimred =</span> <span class="st">"PCA_coldata"</span>,</a> +<a class="sourceLine" id="cb216-4" data-line-number="4"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>, </a> +<a class="sourceLine" id="cb216-5" data-line-number="5"> <span class="dt">shape_by =</span> <span class="st">"use"</span>, </a> +<a class="sourceLine" id="cb216-6" data-line-number="6"> <span class="dt">colour_by =</span> <span class="st">"outlier"</span></a> +<a class="sourceLine" id="cb216-7" data-line-number="7">)</a></code></pre></div> <p><img src="exprs-qc_files/figure-html/unnamed-chunk-14-1.png" width="90%" style="display: block; margin: auto;" /></p> </div> </div> @@ -775,6 +775,18 @@ their quality metrics.</p> <p>Compare the default, automatic and manual cell filters. Plot a Venn diagram of the outlier cells from these filterings.</p> <p><strong>Hint</strong>: Use <code>vennCounts</code> and <code>vennDiagram</code> functions from the <a href="https://bioconductor.org/packages/release/bioc/html/limma.html">limma</a> package to make a Venn diagram.</p> <p><strong>Answer</strong></p> +<div class="sourceCode" id="cb217"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb217-1" data-line-number="1"><span class="kw">library</span>(limma)</a> +<a class="sourceLine" id="cb217-2" data-line-number="2">auto <-<span class="st"> </span><span class="kw">colnames</span>(umi)[umi<span class="op">$</span>outlier]</a> +<a class="sourceLine" id="cb217-3" data-line-number="3">man <-<span class="st"> </span><span class="kw">colnames</span>(umi)[<span class="op">!</span>umi<span class="op">$</span>use]</a> +<a class="sourceLine" id="cb217-4" data-line-number="4">venn.diag <-<span class="st"> </span><span class="kw">vennCounts</span>(</a> +<a class="sourceLine" id="cb217-5" data-line-number="5"> <span class="kw">cbind</span>(<span class="kw">colnames</span>(umi) <span class="op">%in%</span><span 
class="st"> </span>auto,</a> +<a class="sourceLine" id="cb217-6" data-line-number="6"> <span class="kw">colnames</span>(umi) <span class="op">%in%</span><span class="st"> </span>man)</a> +<a class="sourceLine" id="cb217-7" data-line-number="7">)</a> +<a class="sourceLine" id="cb217-8" data-line-number="8"><span class="kw">vennDiagram</span>(</a> +<a class="sourceLine" id="cb217-9" data-line-number="9"> venn.diag,</a> +<a class="sourceLine" id="cb217-10" data-line-number="10"> <span class="dt">names =</span> <span class="kw">c</span>(<span class="st">"Automatic"</span>, <span class="st">"Manual"</span>),</a> +<a class="sourceLine" id="cb217-11" data-line-number="11"> <span class="dt">circle.col =</span> <span class="kw">c</span>(<span class="st">"blue"</span>, <span class="st">"green"</span>)</a> +<a class="sourceLine" id="cb217-12" data-line-number="12">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:cell-filt-comp"></span> <img src="exprs-qc_files/figure-html/cell-filt-comp-1.png" alt="Comparison of the default, automatic and manual cell filters" width="90%" /> <p class="caption"> @@ -790,13 +802,13 @@ one droplet resulting one cell barcode actually containing read information from multiple cells. One way to find doublets/multiplets in the data is to see if there are cells co-expressing markers of distinct cell types. There are also computational tools available for detecting potential doublets in the cells. A -lot of these tools rely on artifical doublets formed from the datasets by +lot of these tools rely on artificial doublets formed from the datasets by randomly joining the expression profiles of two cells. 
Then the cells are tested against the artificial doublet profiles.</p> <p>We demonstrate the usage of two of these doublet detection tools.</p> <div id="scds" class="section level3"> <h3><span class="header-section-number">6.3.1</span> scds</h3> -<p><code>scds</code> has two detection methods:</p> +<p><code>scds</code><span class="citation">(<span class="citeproc-not-found" data-reference-id="Bais2019-hf"><strong>???</strong></span>)</span> has two detection methods:</p> <ol style="list-style-type: decimal"> <li>co-expression based;</li> <li>binary-classification based.</li> </ol> @@ -806,55 +818,76 @@ estimated based on a binomial model and gene pairs that do not co-expression often get higher scores when they co-expression in some cells. The cells’ doublet scores are derived based on the co-expression of pairs of genes. In the binary classification based approach, artificial doublet clusters are generated -and cells are difficult to separte from the artificial doublets get higher +and cells that are difficult to separate from the artificial doublets get higher doublet scores.</p> -<div class="sourceCode" id="cb209"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb209-1" data-line-number="1"><span class="kw">library</span>(scds)</a> -<a class="sourceLine" id="cb209-2" data-line-number="2"><span class="co">#- Annotate doublet using co-expression based doublet scoring:</span></a> -<a class="sourceLine" id="cb209-3" data-line-number="3">umi =<span class="st"> </span><span class="kw">cxds</span>(umi)</a> -<a class="sourceLine" id="cb209-4" data-line-number="4"></a> -<a class="sourceLine" id="cb209-5" data-line-number="5"><span class="co">#- Annotate doublet using binary classification based doublet scoring:</span></a> -<a class="sourceLine" id="cb209-6" data-line-number="6">umi =<span class="st"> </span><span class="kw">bcds</span>(umi)</a></code></pre></div> -<pre><code>## [1] train-error:0.065830+0.003564 test-error:0.099490+0.021301 +<div
class="sourceCode" id="cb218"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb218-1" data-line-number="1"><span class="kw">library</span>(scds)</a> +<a class="sourceLine" id="cb218-2" data-line-number="2"><span class="co">#- Annotate doublet using co-expression based doublet scoring:</span></a> +<a class="sourceLine" id="cb218-3" data-line-number="3">umi =<span class="st"> </span><span class="kw">cxds</span>(umi)</a> +<a class="sourceLine" id="cb218-4" data-line-number="4"></a> +<a class="sourceLine" id="cb218-5" data-line-number="5"><span class="co">#- Annotate doublet using binary classification based doublet scoring:</span></a> +<a class="sourceLine" id="cb218-6" data-line-number="6">umi =<span class="st"> </span><span class="kw">bcds</span>(umi)</a></code></pre></div> +<pre><code>## [1] train-error:0.056712+0.006782 test-error:0.090820+0.022608 ## Multiple eval metrics are present. Will use test_error for early stopping. ## Will train until test_error hasn't improved in 2 rounds. 
## -## [2] train-error:0.050057+0.006014 test-error:0.079285+0.013055 -## [3] train-error:0.039643+0.003520 test-error:0.071160+0.011813 -## [4] train-error:0.033855+0.005592 test-error:0.065367+0.015263 -## [5] train-error:0.029079+0.005075 test-error:0.065377+0.010823 -## [6] train-error:0.026621+0.005039 test-error:0.061324+0.010783 -## [7] train-error:0.019819+0.003215 test-error:0.054953+0.011878 -## [8] train-error:0.018662+0.003859 test-error:0.054358+0.014838 -## [9] train-error:0.016493+0.002438 test-error:0.058421+0.010390 -## [10] train-error:0.014901+0.004077 test-error:0.056687+0.009949 +## [2] train-error:0.042102+0.002537 test-error:0.084458+0.011641 +## [3] train-error:0.031539+0.002448 test-error:0.071155+0.009566 +## [4] train-error:0.029224+0.001912 test-error:0.072279+0.017508 +## [5] train-error:0.024595+0.002624 test-error:0.066512+0.016282 +## [6] train-error:0.021412+0.001913 test-error:0.063073+0.009557 +## [7] train-error:0.018373+0.002762 test-error:0.056687+0.016847 +## [8] train-error:0.016636+0.004358 test-error:0.052079+0.011572 +## [9] train-error:0.014466+0.002777 test-error:0.051499+0.008444 +## [10] train-error:0.012731+0.001173 test-error:0.048021+0.010077 +## [11] train-error:0.012586+0.001800 test-error:0.046292+0.011280 +## [12] train-error:0.009692+0.002442 test-error:0.045707+0.009178 +## [13] train-error:0.007957+0.002586 test-error:0.043398+0.007749 +## [14] train-error:0.007378+0.002521 test-error:0.043393+0.009114 +## [15] train-error:0.007668+0.002402 test-error:0.043398+0.008171 +## [16] train-error:0.006944+0.002024 test-error:0.041084+0.009753 +## [17] train-error:0.004919+0.002115 test-error:0.038186+0.008825 +## [18] train-error:0.004774+0.002024 test-error:0.038761+0.008839 +## [19] train-error:0.003906+0.001863 test-error:0.037021+0.008974 +## [20] train-error:0.003038+0.001674 test-error:0.036447+0.008460 +## [21] train-error:0.002604+0.001084 test-error:0.037606+0.009120 +## [22] train-error:0.002604+0.000982 
test-error:0.038181+0.010044 ## Stopping. Best iteration: -## [8] train-error:0.018662+0.003859 test-error:0.054358+0.014838 +## [20] train-error:0.003038+0.001674 test-error:0.036447+0.008460 ## -## [1] train-error:0.061921 +## [1] train-error:0.065972 ## Will train until train_error hasn't improved in 2 rounds. ## -## [2] train-error:0.052083 -## [3] train-error:0.039352 -## [4] train-error:0.031829</code></pre> -<div class="sourceCode" id="cb211"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb211-1" data-line-number="1"><span class="co">#- Combine both annotations into a hybrid annotation</span></a> -<a class="sourceLine" id="cb211-2" data-line-number="2">umi =<span class="st"> </span><span class="kw">cxds_bcds_hybrid</span>(umi)</a> -<a class="sourceLine" id="cb211-3" data-line-number="3"></a> -<a class="sourceLine" id="cb211-4" data-line-number="4"><span class="co">#- Doublet scores are now available via colData:</span></a> -<a class="sourceLine" id="cb211-5" data-line-number="5">CD =<span class="st"> </span><span class="kw">colData</span>(umi)</a> -<a class="sourceLine" id="cb211-6" data-line-number="6"><span class="kw">head</span>(<span class="kw">cbind</span>(CD<span class="op">$</span>cxds_score,CD<span class="op">$</span>bcds_score, CD<span class="op">$</span>hybrid_score))</a></code></pre></div> -<pre><code>## [,1] [,2] [,3] -## NA19098.r1.A01 4131.405 0.05192234 0.2552833 -## NA19098.r1.A02 4564.089 0.03846648 0.2656644 -## NA19098.r1.A03 2827.904 0.03932181 0.1647904 -## NA19098.r1.A04 4708.213 0.04480528 0.2811814 -## NA19098.r1.A05 6134.590 0.03854402 0.3578605 -## NA19098.r1.A06 5810.730 0.03731131 0.3374924</code></pre> -<div class="sourceCode" id="cb213"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb213-1" data-line-number="1"><span class="kw">plotColData</span>(</a> -<a class="sourceLine" id="cb213-2" data-line-number="2"> umi,</a> -<a class="sourceLine" id="cb213-3" 
data-line-number="3"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb213-4" data-line-number="4"> <span class="dt">y =</span> <span class="st">"pct_counts_ERCC"</span>,</a> -<a class="sourceLine" id="cb213-5" data-line-number="5"> <span class="dt">colour =</span> <span class="st">"hybrid_score"</span></a> -<a class="sourceLine" id="cb213-6" data-line-number="6">)</a></code></pre></div> +## [2] train-error:0.046875 +## [3] train-error:0.030671 +## [4] train-error:0.028356 +## [5] train-error:0.022569 +## [6] train-error:0.021412 +## [7] train-error:0.019676 +## [8] train-error:0.018519 +## [9] train-error:0.016204 +## [10] train-error:0.013310 +## [11] train-error:0.011574 +## [12] train-error:0.009838 +## [13] train-error:0.008102</code></pre> +<div class="sourceCode" id="cb220"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb220-1" data-line-number="1"><span class="co">#- Combine both annotations into a hybrid annotation</span></a> +<a class="sourceLine" id="cb220-2" data-line-number="2">umi =<span class="st"> </span><span class="kw">cxds_bcds_hybrid</span>(umi)</a> +<a class="sourceLine" id="cb220-3" data-line-number="3"></a> +<a class="sourceLine" id="cb220-4" data-line-number="4"><span class="co">#- Doublet scores are now available via colData:</span></a> +<a class="sourceLine" id="cb220-5" data-line-number="5">CD =<span class="st"> </span><span class="kw">colData</span>(umi)</a> +<a class="sourceLine" id="cb220-6" data-line-number="6"><span class="kw">head</span>(<span class="kw">cbind</span>(CD<span class="op">$</span>cxds_score,CD<span class="op">$</span>bcds_score, CD<span class="op">$</span>hybrid_score))</a></code></pre></div> +<pre><code>## [,1] [,2] [,3] +## NA19098.r1.A01 4131.405 0.013268524 0.2493021 +## NA19098.r1.A02 4564.089 0.006372486 0.2676119 +## NA19098.r1.A03 2827.904 0.002598290 0.1619169 +## NA19098.r1.A04 4708.213 0.013077467 0.2829361 +## 
NA19098.r1.A05 6134.590 0.005533409 0.3588618 +## NA19098.r1.A06 5810.730 0.006969100 0.3413388</code></pre> +<div class="sourceCode" id="cb222"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb222-1" data-line-number="1"><span class="kw">plotColData</span>(</a> +<a class="sourceLine" id="cb222-2" data-line-number="2"> umi,</a> +<a class="sourceLine" id="cb222-3" data-line-number="3"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb222-4" data-line-number="4"> <span class="dt">y =</span> <span class="st">"pct_counts_ERCC"</span>,</a> +<a class="sourceLine" id="cb222-5" data-line-number="5"> <span class="dt">colour =</span> <span class="st">"hybrid_score"</span></a> +<a class="sourceLine" id="cb222-6" data-line-number="6">)</a></code></pre></div> <p><img src="exprs-qc_files/figure-html/unnamed-chunk-15-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>The <a href="https://doi.org/10.1093/bioinformatics/btz698"><code>scds</code> paper</a> features excellent descriptions and evaluations of other currently-available doublet @@ -867,17 +900,17 @@ detection methods.</p> <pre><code>python run_doubletDetection.py </code></pre> <p>Here is the prediction results by <code>DoubletDetection</code>:</p> -<div class="sourceCode" id="cb215"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb215-1" data-line-number="1"><span class="kw">require</span>(UpSetR)</a></code></pre></div> +<div class="sourceCode" id="cb224"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb224-1" data-line-number="1"><span class="kw">require</span>(UpSetR)</a></code></pre></div> <pre><code>## Loading required package: UpSetR</code></pre> -<div class="sourceCode" id="cb217"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb217-1" data-line-number="1">pred_tung <-<span class="st"> </span><span 
class="kw">read.delim</span>(<span class="dt">file =</span> <span class="st">"data/doublets/tung.dbls.txt"</span>, <span class="dt">header =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb217-2" data-line-number="2"><span class="kw">dim</span>(pred_tung)</a></code></pre></div> +<div class="sourceCode" id="cb226"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb226-1" data-line-number="1">pred_tung <-<span class="st"> </span><span class="kw">read.delim</span>(<span class="dt">file =</span> <span class="st">"data/doublets/tung.dbls.txt"</span>, <span class="dt">header =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb226-2" data-line-number="2"><span class="kw">dim</span>(pred_tung)</a></code></pre></div> <pre><code>## [1] 864 1</code></pre> -<div class="sourceCode" id="cb219"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb219-1" data-line-number="1"><span class="kw">dim</span>(anno)</a></code></pre></div> +<div class="sourceCode" id="cb228"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb228-1" data-line-number="1"><span class="kw">dim</span>(anno)</a></code></pre></div> <pre><code>## [1] 864 5</code></pre> -<div class="sourceCode" id="cb221"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb221-1" data-line-number="1">umi<span class="op">$</span>dbd_dbl <-<span class="st"> </span><span class="kw">factor</span>(pred_tung<span class="op">$</span>V1)</a> -<a class="sourceLine" id="cb221-2" data-line-number="2"></a> -<a class="sourceLine" id="cb221-3" data-line-number="3">qc_label <-<span class="st"> </span><span class="kw">read.delim</span>(<span class="dt">file =</span> <span class="st">"data/qc_ipsc.txt"</span>)</a> -<a class="sourceLine" id="cb221-4" data-line-number="4"><span class="kw">head</span>(qc_label)</a></code></pre></div> +<div class="sourceCode" id="cb230"><pre class="sourceCode 
r"><code class="sourceCode r"><a class="sourceLine" id="cb230-1" data-line-number="1">umi<span class="op">$</span>dbd_dbl <-<span class="st"> </span><span class="kw">factor</span>(pred_tung<span class="op">$</span>V1)</a> +<a class="sourceLine" id="cb230-2" data-line-number="2"></a> +<a class="sourceLine" id="cb230-3" data-line-number="3">qc_label <-<span class="st"> </span><span class="kw">read.delim</span>(<span class="dt">file =</span> <span class="st">"data/qc_ipsc.txt"</span>)</a> +<a class="sourceLine" id="cb230-4" data-line-number="4"><span class="kw">head</span>(qc_label)</a></code></pre></div> <pre><code>## individual replicate well cell_number concentration tra1.60 ## 1 NA19098 r1 A01 1 1.734785 1 ## 2 NA19098 r1 A02 1 1.723038 1 @@ -885,39 +918,39 @@ detection methods.</p> ## 4 NA19098 r1 A04 1 1.347492 1 ## 5 NA19098 r1 A05 1 2.313047 1 ## 6 NA19098 r1 A06 1 2.056803 1</code></pre> -<div class="sourceCode" id="cb223"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb223-1" data-line-number="1">qc_label<span class="op">$</span>sample_id <-<span class="st"> </span><span class="kw">paste0</span>(qc_label<span class="op">$</span>individual,<span class="st">"."</span>,qc_label<span class="op">$</span>replicate,<span class="st">"."</span>,qc_label<span class="op">$</span>well)</a> -<a class="sourceLine" id="cb223-2" data-line-number="2"><span class="kw">rownames</span>(qc_label) <-<span class="st"> </span>qc_label<span class="op">$</span>sample_id</a> -<a class="sourceLine" id="cb223-3" data-line-number="3"></a> -<a class="sourceLine" id="cb223-4" data-line-number="4">umi<span class="op">$</span>cell_number <-<span class="st"> </span><span class="kw">as.character</span>(qc_label[umi<span class="op">$</span>sample_id,<span class="st">"cell_number"</span>])</a> -<a class="sourceLine" id="cb223-5" data-line-number="5"></a> -<a class="sourceLine" id="cb223-6" data-line-number="6">umi<span class="op">$</span>cell_number[qc_label<span 
class="op">$</span>cell_number<span class="op">==</span><span class="dv">0</span>] <-<span class="st"> "no_cell"</span></a> -<a class="sourceLine" id="cb223-7" data-line-number="7">umi<span class="op">$</span>cell_number[qc_label<span class="op">$</span>cell_number <span class="op">==</span><span class="st"> </span><span class="dv">1</span>] <-<span class="st"> "single_cell"</span></a> -<a class="sourceLine" id="cb223-8" data-line-number="8">umi<span class="op">$</span>cell_number[qc_label<span class="op">$</span>cell_number<span class="op">></span><span class="dv">1</span>] <-<span class="st"> "multi_cell"</span></a> -<a class="sourceLine" id="cb223-9" data-line-number="9"></a> -<a class="sourceLine" id="cb223-10" data-line-number="10"><span class="kw">multiplot</span>(<span class="kw">plotColData</span>(</a> -<a class="sourceLine" id="cb223-11" data-line-number="11"> umi,</a> -<a class="sourceLine" id="cb223-12" data-line-number="12"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb223-13" data-line-number="13"> <span class="dt">y =</span> <span class="st">"pct_counts_ERCC"</span>,</a> -<a class="sourceLine" id="cb223-14" data-line-number="14"> <span class="dt">colour =</span> <span class="st">"hybrid_score"</span></a> -<a class="sourceLine" id="cb223-15" data-line-number="15">),</a> -<a class="sourceLine" id="cb223-16" data-line-number="16"><span class="kw">plotColData</span>(</a> -<a class="sourceLine" id="cb223-17" data-line-number="17"> umi,</a> -<a class="sourceLine" id="cb223-18" data-line-number="18"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb223-19" data-line-number="19"> <span class="dt">y =</span> <span class="st">"pct_counts_ERCC"</span>,</a> -<a class="sourceLine" id="cb223-20" data-line-number="20"> <span class="dt">colour =</span> <span class="st">"dbd_dbl"</span></a> -<a class="sourceLine" id="cb223-21" 
data-line-number="21">), <span class="kw">plotColData</span>(</a> -<a class="sourceLine" id="cb223-22" data-line-number="22"> umi,</a> -<a class="sourceLine" id="cb223-23" data-line-number="23"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb223-24" data-line-number="24"> <span class="dt">y =</span> <span class="st">"pct_counts_ERCC"</span>,</a> -<a class="sourceLine" id="cb223-25" data-line-number="25"> <span class="dt">colour =</span> <span class="st">"cell_number"</span></a> -<a class="sourceLine" id="cb223-26" data-line-number="26">),<span class="dt">cols =</span><span class="dv">2</span>)</a></code></pre></div> +<div class="sourceCode" id="cb232"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb232-1" data-line-number="1">qc_label<span class="op">$</span>sample_id <-<span class="st"> </span><span class="kw">paste0</span>(qc_label<span class="op">$</span>individual,<span class="st">"."</span>,qc_label<span class="op">$</span>replicate,<span class="st">"."</span>,qc_label<span class="op">$</span>well)</a> +<a class="sourceLine" id="cb232-2" data-line-number="2"><span class="kw">rownames</span>(qc_label) <-<span class="st"> </span>qc_label<span class="op">$</span>sample_id</a> +<a class="sourceLine" id="cb232-3" data-line-number="3"></a> +<a class="sourceLine" id="cb232-4" data-line-number="4">umi<span class="op">$</span>cell_number <-<span class="st"> </span><span class="kw">as.character</span>(qc_label[umi<span class="op">$</span>sample_id,<span class="st">"cell_number"</span>])</a> +<a class="sourceLine" id="cb232-5" data-line-number="5"></a> +<a class="sourceLine" id="cb232-6" data-line-number="6">umi<span class="op">$</span>cell_number[qc_label<span class="op">$</span>cell_number<span class="op">==</span><span class="dv">0</span>] <-<span class="st"> "no_cell"</span></a> +<a class="sourceLine" id="cb232-7" data-line-number="7">umi<span 
class="op">$</span>cell_number[qc_label<span class="op">$</span>cell_number <span class="op">==</span><span class="st"> </span><span class="dv">1</span>] <-<span class="st"> "single_cell"</span></a> +<a class="sourceLine" id="cb232-8" data-line-number="8">umi<span class="op">$</span>cell_number[qc_label<span class="op">$</span>cell_number<span class="op">></span><span class="dv">1</span>] <-<span class="st"> "multi_cell"</span></a> +<a class="sourceLine" id="cb232-9" data-line-number="9"></a> +<a class="sourceLine" id="cb232-10" data-line-number="10"><span class="kw">multiplot</span>(<span class="kw">plotColData</span>(</a> +<a class="sourceLine" id="cb232-11" data-line-number="11"> umi,</a> +<a class="sourceLine" id="cb232-12" data-line-number="12"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb232-13" data-line-number="13"> <span class="dt">y =</span> <span class="st">"pct_counts_ERCC"</span>,</a> +<a class="sourceLine" id="cb232-14" data-line-number="14"> <span class="dt">colour =</span> <span class="st">"hybrid_score"</span></a> +<a class="sourceLine" id="cb232-15" data-line-number="15">),</a> +<a class="sourceLine" id="cb232-16" data-line-number="16"><span class="kw">plotColData</span>(</a> +<a class="sourceLine" id="cb232-17" data-line-number="17"> umi,</a> +<a class="sourceLine" id="cb232-18" data-line-number="18"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb232-19" data-line-number="19"> <span class="dt">y =</span> <span class="st">"pct_counts_ERCC"</span>,</a> +<a class="sourceLine" id="cb232-20" data-line-number="20"> <span class="dt">colour =</span> <span class="st">"dbd_dbl"</span></a> +<a class="sourceLine" id="cb232-21" data-line-number="21">), <span class="kw">plotColData</span>(</a> +<a class="sourceLine" id="cb232-22" data-line-number="22"> umi,</a> +<a class="sourceLine" id="cb232-23" data-line-number="23"> <span 
class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb232-24" data-line-number="24"> <span class="dt">y =</span> <span class="st">"pct_counts_ERCC"</span>,</a> +<a class="sourceLine" id="cb232-25" data-line-number="25"> <span class="dt">colour =</span> <span class="st">"cell_number"</span></a> +<a class="sourceLine" id="cb232-26" data-line-number="26">),<span class="dt">cols =</span><span class="dv">2</span>)</a></code></pre></div> <p><img src="exprs-qc_files/figure-html/unnamed-chunk-16-1.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb224"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb224-1" data-line-number="1">doublets <-<span class="st"> </span><span class="kw">unique</span>(umi<span class="op">$</span>sample_id[umi<span class="op">$</span>dbd_dbl <span class="op">==</span><span class="st">"1"</span>],</a> -<a class="sourceLine" id="cb224-2" data-line-number="2"> umi<span class="op">$</span>sample_id[umi<span class="op">$</span>hybrid_score <span class="op">></span><span class="st"> </span><span class="fl">0.8</span>])</a> -<a class="sourceLine" id="cb224-3" data-line-number="3"></a> -<a class="sourceLine" id="cb224-4" data-line-number="4"></a> -<a class="sourceLine" id="cb224-5" data-line-number="5">pl_list <-<span class="st"> </span>UpSetR<span class="op">::</span><span class="kw">fromList</span>(<span class="kw">list</span>(<span class="dt">pred =</span> doublets,<span class="dt">qc_label =</span> qc_label<span class="op">$</span>sample_id[qc_label<span class="op">$</span>cell_number <span class="op">></span><span class="dv">1</span>]))</a> -<a class="sourceLine" id="cb224-6" data-line-number="6">UpSetR<span class="op">::</span><span class="kw">upset</span>(pl_list,<span class="dt">sets =</span> <span class="kw">c</span>(<span class="st">"pred"</span>,<span class="st">"qc_label"</span>))</a></code></pre></div> +<div 
class="sourceCode" id="cb233"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb233-1" data-line-number="1">doublets <-<span class="st"> </span><span class="kw">unique</span>(umi<span class="op">$</span>sample_id[umi<span class="op">$</span>dbd_dbl <span class="op">==</span><span class="st">"1"</span>],</a> +<a class="sourceLine" id="cb233-2" data-line-number="2"> umi<span class="op">$</span>sample_id[umi<span class="op">$</span>hybrid_score <span class="op">></span><span class="st"> </span><span class="fl">0.8</span>])</a> +<a class="sourceLine" id="cb233-3" data-line-number="3"></a> +<a class="sourceLine" id="cb233-4" data-line-number="4"></a> +<a class="sourceLine" id="cb233-5" data-line-number="5">pl_list <-<span class="st"> </span>UpSetR<span class="op">::</span><span class="kw">fromList</span>(<span class="kw">list</span>(<span class="dt">pred =</span> doublets,<span class="dt">qc_label =</span> qc_label<span class="op">$</span>sample_id[qc_label<span class="op">$</span>cell_number <span class="op">></span><span class="dv">1</span>]))</a> +<a class="sourceLine" id="cb233-6" data-line-number="6">UpSetR<span class="op">::</span><span class="kw">upset</span>(pl_list,<span class="dt">sets =</span> <span class="kw">c</span>(<span class="st">"pred"</span>,<span class="st">"qc_label"</span>))</a></code></pre></div> <p><img src="exprs-qc_files/figure-html/unnamed-chunk-16-2.png" width="90%" style="display: block; margin: auto;" /></p> <div id="other-tools-available" class="section level4"> <h4><span class="header-section-number">6.3.2.1</span> Other tools available:</h4> @@ -938,7 +971,7 @@ exclude genes where we suspect that technical artefacts may have skewed the results. 
Moreover, inspection of the gene expression profiles may provide insights about how the experimental procedures could be improved.</p> <p>It is often instructive to consider the number of reads consumed by the top 50 expressed genes.</p> -<div class="sourceCode" id="cb225"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb225-1" data-line-number="1"><span class="kw">plotHighestExprs</span>(umi, <span class="dt">exprs_values =</span> <span class="st">"counts"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb234"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb234-1" data-line-number="1"><span class="kw">plotHighestExprs</span>(umi, <span class="dt">exprs_values =</span> <span class="st">"counts"</span>)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:top50-gene-expr"></span> <img src="exprs-qc_files/figure-html/top50-gene-expr-1.png" alt="Number of total counts consumed by the top 50 expressed genes" width="90%" /> <p class="caption"> @@ -960,13 +993,13 @@ least two cells. However, in both cases the threshold strongly depends on the sequencing depth. 
It is important to keep in mind that genes must be filtered after cell filtering since some genes may only be detected in poor quality cells (<strong>note</strong> <code>colData(umi)$use</code> filter applied to the <code>umi</code> dataset).</p> -<div class="sourceCode" id="cb226"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb226-1" data-line-number="1">keep_feature <-<span class="st"> </span><span class="kw">nexprs</span>(</a> -<a class="sourceLine" id="cb226-2" data-line-number="2"> umi[,<span class="kw">colData</span>(umi)<span class="op">$</span>use], </a> -<a class="sourceLine" id="cb226-3" data-line-number="3"> <span class="dt">byrow =</span> <span class="ot">TRUE</span>, </a> -<a class="sourceLine" id="cb226-4" data-line-number="4"> <span class="dt">detection_limit =</span> <span class="dv">1</span></a> -<a class="sourceLine" id="cb226-5" data-line-number="5">) <span class="op">>=</span><span class="st"> </span><span class="dv">2</span></a> -<a class="sourceLine" id="cb226-6" data-line-number="6"><span class="kw">rowData</span>(umi)<span class="op">$</span>use <-<span class="st"> </span>keep_feature</a></code></pre></div> -<div class="sourceCode" id="cb227"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb227-1" data-line-number="1"><span class="kw">table</span>(keep_feature)</a></code></pre></div> +<div class="sourceCode" id="cb235"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb235-1" data-line-number="1">keep_feature <-<span class="st"> </span><span class="kw">nexprs</span>(</a> +<a class="sourceLine" id="cb235-2" data-line-number="2"> umi[,<span class="kw">colData</span>(umi)<span class="op">$</span>use], </a> +<a class="sourceLine" id="cb235-3" data-line-number="3"> <span class="dt">byrow =</span> <span class="ot">TRUE</span>, </a> +<a class="sourceLine" id="cb235-4" data-line-number="4"> <span class="dt">detection_limit =</span> <span 
class="dv">1</span></a> +<a class="sourceLine" id="cb235-5" data-line-number="5">) <span class="op">>=</span><span class="st"> </span><span class="dv">2</span></a> +<a class="sourceLine" id="cb235-6" data-line-number="6"><span class="kw">rowData</span>(umi)<span class="op">$</span>use <-<span class="st"> </span>keep_feature</a></code></pre></div> +<div class="sourceCode" id="cb236"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb236-1" data-line-number="1"><span class="kw">table</span>(keep_feature)</a></code></pre></div> <pre><code>## keep_feature ## FALSE TRUE ## 4660 14066</code></pre> @@ -977,14 +1010,14 @@ appropriate.</p> <h3><span class="header-section-number">6.4.3</span> Save the data</h3> <p>Dimensions of the QCed dataset (do not forget about the gene filter we defined above):</p> -<div class="sourceCode" id="cb229"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb229-1" data-line-number="1"><span class="kw">dim</span>(umi[<span class="kw">rowData</span>(umi)<span class="op">$</span>use, <span class="kw">colData</span>(umi)<span class="op">$</span>use])</a></code></pre></div> +<div class="sourceCode" id="cb238"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb238-1" data-line-number="1"><span class="kw">dim</span>(umi[<span class="kw">rowData</span>(umi)<span class="op">$</span>use, <span class="kw">colData</span>(umi)<span class="op">$</span>use])</a></code></pre></div> <pre><code>## [1] 14066 657</code></pre> <p>Let’s create an additional slot with log-transformed counts (we will need it in the next chapters) and remove saved PCA results from the <code>reducedDim</code> slot:</p> -<div class="sourceCode" id="cb231"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb231-1" data-line-number="1"><span class="kw">assay</span>(umi, <span class="st">"logcounts_raw"</span>) <-<span class="st"> </span><span class="kw">log2</span>(<span 
class="kw">counts</span>(umi) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)</a> -<a class="sourceLine" id="cb231-2" data-line-number="2"><span class="kw">reducedDim</span>(umi) <-<span class="st"> </span><span class="ot">NULL</span></a></code></pre></div> +<div class="sourceCode" id="cb240"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb240-1" data-line-number="1"><span class="kw">assay</span>(umi, <span class="st">"logcounts_raw"</span>) <-<span class="st"> </span><span class="kw">log2</span>(<span class="kw">counts</span>(umi) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)</a> +<a class="sourceLine" id="cb240-2" data-line-number="2"><span class="kw">reducedDim</span>(umi) <-<span class="st"> </span><span class="ot">NULL</span></a></code></pre></div> <p>Save the data:</p> -<div class="sourceCode" id="cb232"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb232-1" data-line-number="1"><span class="kw">saveRDS</span>(umi, <span class="dt">file =</span> <span class="st">"data/tung/umi.rds"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb241"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb241-1" data-line-number="1"><span class="kw">saveRDS</span>(umi, <span class="dt">file =</span> <span class="st">"data/tung/umi.rds"</span>)</a></code></pre></div> </div> <div id="big-exercise" class="section level3"> <h3><span class="header-section-number">6.4.4</span> Big Exercise</h3> @@ -1087,12 +1120,12 @@ compare your results to ours (next chapter).</p> </div> <div id="exercise-expression-qc-reads" class="section level2"> <h2><span class="header-section-number">6.5</span> Exercise: Expression QC (Reads)</h2> -<div class="sourceCode" id="cb234"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb234-1" data-line-number="1"><span class="kw">library</span>(SingleCellExperiment)</a> 
-<a class="sourceLine" id="cb234-2" data-line-number="2"><span class="kw">library</span>(scater)</a> -<a class="sourceLine" id="cb234-3" data-line-number="3"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a></code></pre></div> -<div class="sourceCode" id="cb235"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb235-1" data-line-number="1">reads <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/reads.txt"</span>, <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>)</a> -<a class="sourceLine" id="cb235-2" data-line-number="2">anno <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/annotation.txt"</span>, <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>, <span class="dt">header =</span> <span class="ot">TRUE</span>)</a></code></pre></div> -<div class="sourceCode" id="cb236"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb236-1" data-line-number="1"><span class="kw">head</span>(reads[ , <span class="dv">1</span><span class="op">:</span><span class="dv">3</span>])</a></code></pre></div> +<div class="sourceCode" id="cb243"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb243-1" data-line-number="1"><span class="kw">library</span>(SingleCellExperiment)</a> +<a class="sourceLine" id="cb243-2" data-line-number="2"><span class="kw">library</span>(scater)</a> +<a class="sourceLine" id="cb243-3" data-line-number="3"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb244"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb244-1" data-line-number="1">reads <-<span class="st"> </span><span 
class="kw">read.table</span>(<span class="st">"data/tung/reads.txt"</span>, <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>)</a> +<a class="sourceLine" id="cb244-2" data-line-number="2">anno <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"data/tung/annotation.txt"</span>, <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>, <span class="dt">header =</span> <span class="ot">TRUE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb245"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb245-1" data-line-number="1"><span class="kw">head</span>(reads[ , <span class="dv">1</span><span class="op">:</span><span class="dv">3</span>])</a></code></pre></div> <pre><code>## NA19098.r1.A01 NA19098.r1.A02 NA19098.r1.A03 ## ENSG00000237683 0 0 0 ## ENSG00000187634 0 0 0 @@ -1100,7 +1133,7 @@ compare your results to ours (next chapter).</p> ## ENSG00000187961 0 0 0 ## ENSG00000187583 0 0 0 ## ENSG00000187642 0 0 0</code></pre> -<div class="sourceCode" id="cb238"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb238-1" data-line-number="1"><span class="kw">head</span>(anno)</a></code></pre></div> +<div class="sourceCode" id="cb247"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb247-1" data-line-number="1"><span class="kw">head</span>(anno)</a></code></pre></div> <pre><code>## individual replicate well batch sample_id ## 1 NA19098 r1 A01 NA19098.r1 NA19098.r1.A01 ## 2 NA19098 r1 A02 NA19098.r1 NA19098.r1.A02 @@ -1108,130 +1141,130 @@ compare your results to ours (next chapter).</p> ## 4 NA19098 r1 A04 NA19098.r1 NA19098.r1.A04 ## 5 NA19098 r1 A05 NA19098.r1 NA19098.r1.A05 ## 6 NA19098 r1 A06 NA19098.r1 NA19098.r1.A06</code></pre> -<div class="sourceCode" id="cb240"><pre class="sourceCode r"><code class="sourceCode r"><a 
class="sourceLine" id="cb240-1" data-line-number="1">reads <-<span class="st"> </span><span class="kw">SingleCellExperiment</span>(</a> -<a class="sourceLine" id="cb240-2" data-line-number="2"> <span class="dt">assays =</span> <span class="kw">list</span>(<span class="dt">counts =</span> <span class="kw">as.matrix</span>(reads)), </a> -<a class="sourceLine" id="cb240-3" data-line-number="3"> <span class="dt">colData =</span> anno</a> -<a class="sourceLine" id="cb240-4" data-line-number="4">)</a></code></pre></div> -<div class="sourceCode" id="cb241"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb241-1" data-line-number="1">keep_feature <-<span class="st"> </span><span class="kw">rowSums</span>(<span class="kw">counts</span>(reads) <span class="op">></span><span class="st"> </span><span class="dv">0</span>) <span class="op">></span><span class="st"> </span><span class="dv">0</span></a> -<a class="sourceLine" id="cb241-2" data-line-number="2">reads <-<span class="st"> </span>reads[keep_feature, ]</a></code></pre></div> -<div class="sourceCode" id="cb242"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb242-1" data-line-number="1"><span class="kw">isSpike</span>(reads, <span class="st">"ERCC"</span>) <-<span class="st"> </span><span class="kw">grepl</span>(<span class="st">"^ERCC-"</span>, <span class="kw">rownames</span>(reads))</a> -<a class="sourceLine" id="cb242-2" data-line-number="2"><span class="kw">isSpike</span>(reads, <span class="st">"MT"</span>) <-<span class="st"> </span><span class="kw">rownames</span>(reads) <span class="op">%in%</span><span class="st"> </span></a> -<a class="sourceLine" id="cb242-3" data-line-number="3"><span class="st"> </span><span class="kw">c</span>(<span class="st">"ENSG00000198899"</span>, <span class="st">"ENSG00000198727"</span>, <span class="st">"ENSG00000198888"</span>,</a> -<a class="sourceLine" id="cb242-4" data-line-number="4"> <span 
class="st">"ENSG00000198886"</span>, <span class="st">"ENSG00000212907"</span>, <span class="st">"ENSG00000198786"</span>,</a> -<a class="sourceLine" id="cb242-5" data-line-number="5"> <span class="st">"ENSG00000198695"</span>, <span class="st">"ENSG00000198712"</span>, <span class="st">"ENSG00000198804"</span>,</a> -<a class="sourceLine" id="cb242-6" data-line-number="6"> <span class="st">"ENSG00000198763"</span>, <span class="st">"ENSG00000228253"</span>, <span class="st">"ENSG00000198938"</span>,</a> -<a class="sourceLine" id="cb242-7" data-line-number="7"> <span class="st">"ENSG00000198840"</span>)</a></code></pre></div> -<div class="sourceCode" id="cb243"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb243-1" data-line-number="1">reads <-<span class="st"> </span><span class="kw">calculateQCMetrics</span>(</a> -<a class="sourceLine" id="cb243-2" data-line-number="2"> reads,</a> -<a class="sourceLine" id="cb243-3" data-line-number="3"> <span class="dt">feature_controls =</span> <span class="kw">list</span>(</a> -<a class="sourceLine" id="cb243-4" data-line-number="4"> <span class="dt">ERCC =</span> <span class="kw">isSpike</span>(reads, <span class="st">"ERCC"</span>), </a> -<a class="sourceLine" id="cb243-5" data-line-number="5"> <span class="dt">MT =</span> <span class="kw">isSpike</span>(reads, <span class="st">"MT"</span>)</a> -<a class="sourceLine" id="cb243-6" data-line-number="6"> )</a> -<a class="sourceLine" id="cb243-7" data-line-number="7">)</a></code></pre></div> +<div class="sourceCode" id="cb249"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb249-1" data-line-number="1">reads <-<span class="st"> </span><span class="kw">SingleCellExperiment</span>(</a> +<a class="sourceLine" id="cb249-2" data-line-number="2"> <span class="dt">assays =</span> <span class="kw">list</span>(<span class="dt">counts =</span> <span class="kw">as.matrix</span>(reads)), </a> +<a class="sourceLine" id="cb249-3" 
data-line-number="3"> <span class="dt">colData =</span> anno</a> +<a class="sourceLine" id="cb249-4" data-line-number="4">)</a></code></pre></div> +<div class="sourceCode" id="cb250"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb250-1" data-line-number="1">keep_feature <-<span class="st"> </span><span class="kw">rowSums</span>(<span class="kw">counts</span>(reads) <span class="op">></span><span class="st"> </span><span class="dv">0</span>) <span class="op">></span><span class="st"> </span><span class="dv">0</span></a> +<a class="sourceLine" id="cb250-2" data-line-number="2">reads <-<span class="st"> </span>reads[keep_feature, ]</a></code></pre></div> +<div class="sourceCode" id="cb251"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb251-1" data-line-number="1"><span class="kw">isSpike</span>(reads, <span class="st">"ERCC"</span>) <-<span class="st"> </span><span class="kw">grepl</span>(<span class="st">"^ERCC-"</span>, <span class="kw">rownames</span>(reads))</a> +<a class="sourceLine" id="cb251-2" data-line-number="2"><span class="kw">isSpike</span>(reads, <span class="st">"MT"</span>) <-<span class="st"> </span><span class="kw">rownames</span>(reads) <span class="op">%in%</span><span class="st"> </span></a> +<a class="sourceLine" id="cb251-3" data-line-number="3"><span class="st"> </span><span class="kw">c</span>(<span class="st">"ENSG00000198899"</span>, <span class="st">"ENSG00000198727"</span>, <span class="st">"ENSG00000198888"</span>,</a> +<a class="sourceLine" id="cb251-4" data-line-number="4"> <span class="st">"ENSG00000198886"</span>, <span class="st">"ENSG00000212907"</span>, <span class="st">"ENSG00000198786"</span>,</a> +<a class="sourceLine" id="cb251-5" data-line-number="5"> <span class="st">"ENSG00000198695"</span>, <span class="st">"ENSG00000198712"</span>, <span class="st">"ENSG00000198804"</span>,</a> +<a class="sourceLine" id="cb251-6" data-line-number="6"> <span 
class="st">"ENSG00000198763"</span>, <span class="st">"ENSG00000228253"</span>, <span class="st">"ENSG00000198938"</span>,</a> +<a class="sourceLine" id="cb251-7" data-line-number="7"> <span class="st">"ENSG00000198840"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb252"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb252-1" data-line-number="1">reads <-<span class="st"> </span><span class="kw">calculateQCMetrics</span>(</a> +<a class="sourceLine" id="cb252-2" data-line-number="2"> reads,</a> +<a class="sourceLine" id="cb252-3" data-line-number="3"> <span class="dt">feature_controls =</span> <span class="kw">list</span>(</a> +<a class="sourceLine" id="cb252-4" data-line-number="4"> <span class="dt">ERCC =</span> <span class="kw">isSpike</span>(reads, <span class="st">"ERCC"</span>), </a> +<a class="sourceLine" id="cb252-5" data-line-number="5"> <span class="dt">MT =</span> <span class="kw">isSpike</span>(reads, <span class="st">"MT"</span>)</a> +<a class="sourceLine" id="cb252-6" data-line-number="6"> )</a> +<a class="sourceLine" id="cb252-7" data-line-number="7">)</a></code></pre></div> <pre><code>## Warning in calculateQCMetrics(reads, feature_controls = list(ERCC = ## isSpike(reads, : spike-in set 'ERCC' overwritten by feature_controls set of ## the same name</code></pre> -<div class="sourceCode" id="cb245"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb245-1" data-line-number="1"><span class="kw">hist</span>(</a> -<a class="sourceLine" id="cb245-2" data-line-number="2"> reads<span class="op">$</span>total_counts,</a> -<a class="sourceLine" id="cb245-3" data-line-number="3"> <span class="dt">breaks =</span> <span class="dv">100</span></a> -<a class="sourceLine" id="cb245-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb245-5" data-line-number="5"><span class="kw">abline</span>(<span class="dt">v =</span> <span class="fl">1.3e6</span>, <span class="dt">col =</span> <span 
class="st">"red"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb254"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb254-1" data-line-number="1"><span class="kw">hist</span>(</a> +<a class="sourceLine" id="cb254-2" data-line-number="2"> reads<span class="op">$</span>total_counts,</a> +<a class="sourceLine" id="cb254-3" data-line-number="3"> <span class="dt">breaks =</span> <span class="dv">100</span></a> +<a class="sourceLine" id="cb254-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb254-5" data-line-number="5"><span class="kw">abline</span>(<span class="dt">v =</span> <span class="fl">1.3e6</span>, <span class="dt">col =</span> <span class="st">"red"</span>)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:total-counts-hist-reads"></span> <img src="exprs-qc-reads_files/figure-html/total-counts-hist-reads-1.png" alt="Histogram of library sizes for all cells" width="90%" /> <p class="caption"> Figure 6.7: Histogram of library sizes for all cells </p> </div> -<div class="sourceCode" id="cb246"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb246-1" data-line-number="1">filter_by_total_counts <-<span class="st"> </span>(reads<span class="op">$</span>total_counts <span class="op">></span><span class="st"> </span><span class="fl">1.3e6</span>)</a></code></pre></div> -<div class="sourceCode" id="cb247"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb247-1" data-line-number="1"><span class="kw">table</span>(filter_by_total_counts)</a></code></pre></div> +<div class="sourceCode" id="cb255"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb255-1" data-line-number="1">filter_by_total_counts <-<span class="st"> </span>(reads<span class="op">$</span>total_counts <span class="op">></span><span class="st"> </span><span class="fl">1.3e6</span>)</a></code></pre></div> +<div class="sourceCode" 
id="cb256"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb256-1" data-line-number="1"><span class="kw">table</span>(filter_by_total_counts)</a></code></pre></div> <pre><code>## filter_by_total_counts ## FALSE TRUE ## 180 684</code></pre> -<div class="sourceCode" id="cb249"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb249-1" data-line-number="1"><span class="kw">hist</span>(</a> -<a class="sourceLine" id="cb249-2" data-line-number="2"> reads<span class="op">$</span>total_features_by_counts,</a> -<a class="sourceLine" id="cb249-3" data-line-number="3"> <span class="dt">breaks =</span> <span class="dv">100</span></a> -<a class="sourceLine" id="cb249-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb249-5" data-line-number="5"><span class="kw">abline</span>(<span class="dt">v =</span> <span class="dv">7000</span>, <span class="dt">col =</span> <span class="st">"red"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb258"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb258-1" data-line-number="1"><span class="kw">hist</span>(</a> +<a class="sourceLine" id="cb258-2" data-line-number="2"> reads<span class="op">$</span>total_features_by_counts,</a> +<a class="sourceLine" id="cb258-3" data-line-number="3"> <span class="dt">breaks =</span> <span class="dv">100</span></a> +<a class="sourceLine" id="cb258-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb258-5" data-line-number="5"><span class="kw">abline</span>(<span class="dt">v =</span> <span class="dv">7000</span>, <span class="dt">col =</span> <span class="st">"red"</span>)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:total-features-hist-reads"></span> <img src="exprs-qc-reads_files/figure-html/total-features-hist-reads-1.png" alt="Histogram of the number of detected genes in all cells" width="90%" /> <p class="caption"> Figure 6.8: Histogram of the 
number of detected genes in all cells </p> </div> -<div class="sourceCode" id="cb250"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb250-1" data-line-number="1">filter_by_expr_features <-<span class="st"> </span>(reads<span class="op">$</span>total_features_by_counts <span class="op">></span><span class="st"> </span><span class="dv">7000</span>)</a></code></pre></div> -<div class="sourceCode" id="cb251"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb251-1" data-line-number="1"><span class="kw">table</span>(filter_by_expr_features)</a></code></pre></div> +<div class="sourceCode" id="cb259"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb259-1" data-line-number="1">filter_by_expr_features <-<span class="st"> </span>(reads<span class="op">$</span>total_features_by_counts <span class="op">></span><span class="st"> </span><span class="dv">7000</span>)</a></code></pre></div> +<div class="sourceCode" id="cb260"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb260-1" data-line-number="1"><span class="kw">table</span>(filter_by_expr_features)</a></code></pre></div> <pre><code>## filter_by_expr_features ## FALSE TRUE ## 116 748</code></pre> -<div class="sourceCode" id="cb253"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb253-1" data-line-number="1"><span class="kw">plotColData</span>(</a> -<a class="sourceLine" id="cb253-2" data-line-number="2"> reads,</a> -<a class="sourceLine" id="cb253-3" data-line-number="3"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb253-4" data-line-number="4"> <span class="dt">y =</span> <span class="st">"pct_counts_MT"</span>,</a> -<a class="sourceLine" id="cb253-5" data-line-number="5"> <span class="dt">colour =</span> <span class="st">"batch"</span></a> -<a class="sourceLine" id="cb253-6" 
data-line-number="6">)</a></code></pre></div> +<div class="sourceCode" id="cb262"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb262-1" data-line-number="1"><span class="kw">plotColData</span>(</a> +<a class="sourceLine" id="cb262-2" data-line-number="2"> reads,</a> +<a class="sourceLine" id="cb262-3" data-line-number="3"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb262-4" data-line-number="4"> <span class="dt">y =</span> <span class="st">"pct_counts_MT"</span>,</a> +<a class="sourceLine" id="cb262-5" data-line-number="5"> <span class="dt">colour =</span> <span class="st">"batch"</span></a> +<a class="sourceLine" id="cb262-6" data-line-number="6">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:mt-vs-counts-reads"></span> <img src="exprs-qc-reads_files/figure-html/mt-vs-counts-reads-1.png" alt="Percentage of counts in MT genes" width="90%" /> <p class="caption"> Figure 6.9: Percentage of counts in MT genes </p> </div> -<div class="sourceCode" id="cb254"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb254-1" data-line-number="1"><span class="kw">plotColData</span>(</a> -<a class="sourceLine" id="cb254-2" data-line-number="2"> reads,</a> -<a class="sourceLine" id="cb254-3" data-line-number="3"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb254-4" data-line-number="4"> <span class="dt">y =</span> <span class="st">"pct_counts_ERCC"</span>,</a> -<a class="sourceLine" id="cb254-5" data-line-number="5"> <span class="dt">colour =</span> <span class="st">"batch"</span></a> -<a class="sourceLine" id="cb254-6" data-line-number="6">)</a></code></pre></div> +<div class="sourceCode" id="cb263"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb263-1" data-line-number="1"><span class="kw">plotColData</span>(</a> 
+<a class="sourceLine" id="cb263-2" data-line-number="2"> reads,</a> +<a class="sourceLine" id="cb263-3" data-line-number="3"> <span class="dt">x =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb263-4" data-line-number="4"> <span class="dt">y =</span> <span class="st">"pct_counts_ERCC"</span>,</a> +<a class="sourceLine" id="cb263-5" data-line-number="5"> <span class="dt">colour =</span> <span class="st">"batch"</span></a> +<a class="sourceLine" id="cb263-6" data-line-number="6">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:ercc-vs-counts-reads"></span> <img src="exprs-qc-reads_files/figure-html/ercc-vs-counts-reads-1.png" alt="Percentage of counts in ERCCs" width="90%" /> <p class="caption"> Figure 6.10: Percentage of counts in ERCCs </p> </div> -<div class="sourceCode" id="cb255"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb255-1" data-line-number="1">filter_by_ERCC <-<span class="st"> </span></a> -<a class="sourceLine" id="cb255-2" data-line-number="2"><span class="st"> </span>reads<span class="op">$</span>batch <span class="op">!=</span><span class="st"> "NA19098.r2"</span> <span class="op">&</span><span class="st"> </span>reads<span class="op">$</span>pct_counts_ERCC <span class="op"><</span><span class="st"> </span><span class="dv">25</span></a> -<a class="sourceLine" id="cb255-3" data-line-number="3"><span class="kw">table</span>(filter_by_ERCC)</a></code></pre></div> +<div class="sourceCode" id="cb264"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb264-1" data-line-number="1">filter_by_ERCC <-<span class="st"> </span></a> +<a class="sourceLine" id="cb264-2" data-line-number="2"><span class="st"> </span>reads<span class="op">$</span>batch <span class="op">!=</span><span class="st"> "NA19098.r2"</span> <span class="op">&</span><span class="st"> </span>reads<span class="op">$</span>pct_counts_ERCC <span 
class="op"><</span><span class="st"> </span><span class="dv">25</span></a> +<a class="sourceLine" id="cb264-3" data-line-number="3"><span class="kw">table</span>(filter_by_ERCC)</a></code></pre></div> <pre><code>## filter_by_ERCC ## FALSE TRUE ## 103 761</code></pre> -<div class="sourceCode" id="cb257"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb257-1" data-line-number="1">filter_by_MT <-<span class="st"> </span>reads<span class="op">$</span>pct_counts_MT <span class="op"><</span><span class="st"> </span><span class="dv">30</span></a> -<a class="sourceLine" id="cb257-2" data-line-number="2"><span class="kw">table</span>(filter_by_MT)</a></code></pre></div> +<div class="sourceCode" id="cb266"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb266-1" data-line-number="1">filter_by_MT <-<span class="st"> </span>reads<span class="op">$</span>pct_counts_MT <span class="op"><</span><span class="st"> </span><span class="dv">30</span></a> +<a class="sourceLine" id="cb266-2" data-line-number="2"><span class="kw">table</span>(filter_by_MT)</a></code></pre></div> <pre><code>## filter_by_MT ## FALSE TRUE ## 18 846</code></pre> -<div class="sourceCode" id="cb259"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb259-1" data-line-number="1">reads<span class="op">$</span>use <-<span class="st"> </span>(</a> -<a class="sourceLine" id="cb259-2" data-line-number="2"> <span class="co"># sufficient features (genes)</span></a> -<a class="sourceLine" id="cb259-3" data-line-number="3"> filter_by_expr_features <span class="op">&</span></a> -<a class="sourceLine" id="cb259-4" data-line-number="4"><span class="st"> </span><span class="co"># sufficient molecules counted</span></a> -<a class="sourceLine" id="cb259-5" data-line-number="5"><span class="st"> </span>filter_by_total_counts <span class="op">&</span></a> -<a class="sourceLine" id="cb259-6" data-line-number="6"><span class="st"> 
</span><span class="co"># sufficient endogenous RNA</span></a> -<a class="sourceLine" id="cb259-7" data-line-number="7"><span class="st"> </span>filter_by_ERCC <span class="op">&</span></a> -<a class="sourceLine" id="cb259-8" data-line-number="8"><span class="st"> </span><span class="co"># remove cells with unusual number of reads in MT genes</span></a> -<a class="sourceLine" id="cb259-9" data-line-number="9"><span class="st"> </span>filter_by_MT</a> -<a class="sourceLine" id="cb259-10" data-line-number="10">)</a></code></pre></div> -<div class="sourceCode" id="cb260"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb260-1" data-line-number="1"><span class="kw">table</span>(reads<span class="op">$</span>use)</a></code></pre></div> +<div class="sourceCode" id="cb268"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb268-1" data-line-number="1">reads<span class="op">$</span>use <-<span class="st"> </span>(</a> +<a class="sourceLine" id="cb268-2" data-line-number="2"> <span class="co"># sufficient features (genes)</span></a> +<a class="sourceLine" id="cb268-3" data-line-number="3"> filter_by_expr_features <span class="op">&</span></a> +<a class="sourceLine" id="cb268-4" data-line-number="4"><span class="st"> </span><span class="co"># sufficient molecules counted</span></a> +<a class="sourceLine" id="cb268-5" data-line-number="5"><span class="st"> </span>filter_by_total_counts <span class="op">&</span></a> +<a class="sourceLine" id="cb268-6" data-line-number="6"><span class="st"> </span><span class="co"># sufficient endogenous RNA</span></a> +<a class="sourceLine" id="cb268-7" data-line-number="7"><span class="st"> </span>filter_by_ERCC <span class="op">&</span></a> +<a class="sourceLine" id="cb268-8" data-line-number="8"><span class="st"> </span><span class="co"># remove cells with unusual number of reads in MT genes</span></a> +<a class="sourceLine" id="cb268-9" data-line-number="9"><span class="st"> 
</span>filter_by_MT</a> +<a class="sourceLine" id="cb268-10" data-line-number="10">)</a></code></pre></div> +<div class="sourceCode" id="cb269"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb269-1" data-line-number="1"><span class="kw">table</span>(reads<span class="op">$</span>use)</a></code></pre></div> <pre><code>## ## FALSE TRUE ## 258 606</code></pre> -<div class="sourceCode" id="cb262"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb262-1" data-line-number="1">reads <-<span class="st"> </span><span class="kw">runPCA</span>(</a> -<a class="sourceLine" id="cb262-2" data-line-number="2"> reads,</a> -<a class="sourceLine" id="cb262-3" data-line-number="3"> <span class="dt">use_coldata =</span> <span class="ot">TRUE</span>, </a> -<a class="sourceLine" id="cb262-4" data-line-number="4"> <span class="dt">detect_outliers =</span> <span class="ot">TRUE</span></a> -<a class="sourceLine" id="cb262-5" data-line-number="5">)</a> -<a class="sourceLine" id="cb262-6" data-line-number="6"><span class="kw">reducedDimNames</span>(reads)</a></code></pre></div> +<div class="sourceCode" id="cb271"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb271-1" data-line-number="1">reads <-<span class="st"> </span><span class="kw">runPCA</span>(</a> +<a class="sourceLine" id="cb271-2" data-line-number="2"> reads,</a> +<a class="sourceLine" id="cb271-3" data-line-number="3"> <span class="dt">use_coldata =</span> <span class="ot">TRUE</span>, </a> +<a class="sourceLine" id="cb271-4" data-line-number="4"> <span class="dt">detect_outliers =</span> <span class="ot">TRUE</span></a> +<a class="sourceLine" id="cb271-5" data-line-number="5">)</a> +<a class="sourceLine" id="cb271-6" data-line-number="6"><span class="kw">reducedDimNames</span>(reads)</a></code></pre></div> <pre><code>## [1] "PCA_coldata"</code></pre> -<div class="sourceCode" id="cb264"><pre class="sourceCode r"><code class="sourceCode 
r"><a class="sourceLine" id="cb264-1" data-line-number="1"><span class="kw">table</span>(reads<span class="op">$</span>outlier)</a></code></pre></div> +<div class="sourceCode" id="cb273"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb273-1" data-line-number="1"><span class="kw">table</span>(reads<span class="op">$</span>outlier)</a></code></pre></div> <pre><code>## ## FALSE TRUE ## 753 111</code></pre> -<div class="sourceCode" id="cb266"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb266-1" data-line-number="1"><span class="kw">plotReducedDim</span>(</a> -<a class="sourceLine" id="cb266-2" data-line-number="2"> reads,</a> -<a class="sourceLine" id="cb266-3" data-line-number="3"> <span class="dt">use_dimred =</span> <span class="st">"PCA_coldata"</span>,</a> -<a class="sourceLine" id="cb266-4" data-line-number="4"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>, </a> -<a class="sourceLine" id="cb266-5" data-line-number="5"> <span class="dt">shape_by =</span> <span class="st">"use"</span>, </a> -<a class="sourceLine" id="cb266-6" data-line-number="6"> <span class="dt">colour_by =</span> <span class="st">"outlier"</span></a> -<a class="sourceLine" id="cb266-7" data-line-number="7">)</a></code></pre></div> +<div class="sourceCode" id="cb275"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb275-1" data-line-number="1"><span class="kw">plotReducedDim</span>(</a> +<a class="sourceLine" id="cb275-2" data-line-number="2"> reads,</a> +<a class="sourceLine" id="cb275-3" data-line-number="3"> <span class="dt">use_dimred =</span> <span class="st">"PCA_coldata"</span>,</a> +<a class="sourceLine" id="cb275-4" data-line-number="4"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>, </a> +<a class="sourceLine" id="cb275-5" data-line-number="5"> <span class="dt">shape_by =</span> <span class="st">"use"</span>, 
</a> +<a class="sourceLine" id="cb275-6" data-line-number="6"> <span class="dt">colour_by =</span> <span class="st">"outlier"</span></a> +<a class="sourceLine" id="cb275-7" data-line-number="7">)</a></code></pre></div> <p><img src="exprs-qc-reads_files/figure-html/unnamed-chunk-16-1.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb267"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb267-1" data-line-number="1"><span class="kw">library</span>(limma)</a></code></pre></div> +<div class="sourceCode" id="cb276"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb276-1" data-line-number="1"><span class="kw">library</span>(limma)</a></code></pre></div> <pre><code>## ## Attaching package: 'limma'</code></pre> <pre><code>## The following object is masked from 'package:scater': @@ -1240,47 +1273,47 @@ Figure 6.10: Percentage of counts in ERCCs <pre><code>## The following object is masked from 'package:BiocGenerics': ## ## plotMA</code></pre> -<div class="sourceCode" id="cb271"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb271-1" data-line-number="1">auto <-<span class="st"> </span><span class="kw">colnames</span>(reads)[reads<span class="op">$</span>outlier]</a> -<a class="sourceLine" id="cb271-2" data-line-number="2">man <-<span class="st"> </span><span class="kw">colnames</span>(reads)[<span class="op">!</span>reads<span class="op">$</span>use]</a> -<a class="sourceLine" id="cb271-3" data-line-number="3">venn.diag <-<span class="st"> </span><span class="kw">vennCounts</span>(</a> -<a class="sourceLine" id="cb271-4" data-line-number="4"> <span class="kw">cbind</span>(<span class="kw">colnames</span>(reads) <span class="op">%in%</span><span class="st"> </span>auto,</a> -<a class="sourceLine" id="cb271-5" data-line-number="5"> <span class="kw">colnames</span>(reads) <span class="op">%in%</span><span class="st"> </span>man)</a> -<a 
class="sourceLine" id="cb271-6" data-line-number="6">)</a> -<a class="sourceLine" id="cb271-7" data-line-number="7"><span class="kw">vennDiagram</span>(</a> -<a class="sourceLine" id="cb271-8" data-line-number="8"> venn.diag,</a> -<a class="sourceLine" id="cb271-9" data-line-number="9"> <span class="dt">names =</span> <span class="kw">c</span>(<span class="st">"Automatic"</span>, <span class="st">"Manual"</span>),</a> -<a class="sourceLine" id="cb271-10" data-line-number="10"> <span class="dt">circle.col =</span> <span class="kw">c</span>(<span class="st">"blue"</span>, <span class="st">"green"</span>)</a> -<a class="sourceLine" id="cb271-11" data-line-number="11">)</a></code></pre></div> +<div class="sourceCode" id="cb280"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb280-1" data-line-number="1">auto <-<span class="st"> </span><span class="kw">colnames</span>(reads)[reads<span class="op">$</span>outlier]</a> +<a class="sourceLine" id="cb280-2" data-line-number="2">man <-<span class="st"> </span><span class="kw">colnames</span>(reads)[<span class="op">!</span>reads<span class="op">$</span>use]</a> +<a class="sourceLine" id="cb280-3" data-line-number="3">venn.diag <-<span class="st"> </span><span class="kw">vennCounts</span>(</a> +<a class="sourceLine" id="cb280-4" data-line-number="4"> <span class="kw">cbind</span>(<span class="kw">colnames</span>(reads) <span class="op">%in%</span><span class="st"> </span>auto,</a> +<a class="sourceLine" id="cb280-5" data-line-number="5"> <span class="kw">colnames</span>(reads) <span class="op">%in%</span><span class="st"> </span>man)</a> +<a class="sourceLine" id="cb280-6" data-line-number="6">)</a> +<a class="sourceLine" id="cb280-7" data-line-number="7"><span class="kw">vennDiagram</span>(</a> +<a class="sourceLine" id="cb280-8" data-line-number="8"> venn.diag,</a> +<a class="sourceLine" id="cb280-9" data-line-number="9"> <span class="dt">names =</span> <span class="kw">c</span>(<span 
class="st">"Automatic"</span>, <span class="st">"Manual"</span>),</a> +<a class="sourceLine" id="cb280-10" data-line-number="10"> <span class="dt">circle.col =</span> <span class="kw">c</span>(<span class="st">"blue"</span>, <span class="st">"green"</span>)</a> +<a class="sourceLine" id="cb280-11" data-line-number="11">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:cell-filt-comp-reads"></span> <img src="exprs-qc-reads_files/figure-html/cell-filt-comp-reads-1.png" alt="Comparison of the default, automatic and manual cell filters" width="90%" /> <p class="caption"> Figure 6.11: Comparison of the default, automatic and manual cell filters </p> </div> -<div class="sourceCode" id="cb272"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb272-1" data-line-number="1"><span class="kw">plotHighestExprs</span>(reads, <span class="dt">exprs_values =</span> <span class="st">"counts"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb281"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb281-1" data-line-number="1"><span class="kw">plotHighestExprs</span>(reads, <span class="dt">exprs_values =</span> <span class="st">"counts"</span>)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:top50-gene-expr-reads"></span> <img src="exprs-qc-reads_files/figure-html/top50-gene-expr-reads-1.png" alt="Number of total counts consumed by the top 50 expressed genes" width="90%" /> <p class="caption"> Figure 6.12: Number of total counts consumed by the top 50 expressed genes </p> </div> -<div class="sourceCode" id="cb273"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb273-1" data-line-number="1">keep_feature <-<span class="st"> </span><span class="kw">nexprs</span>(</a> -<a class="sourceLine" id="cb273-2" data-line-number="2"> reads[,<span class="kw">colData</span>(reads)<span class="op">$</span>use], </a> -<a 
class="sourceLine" id="cb273-3" data-line-number="3"> <span class="dt">byrow =</span> <span class="ot">TRUE</span>, </a> -<a class="sourceLine" id="cb273-4" data-line-number="4"> <span class="dt">detection_limit =</span> <span class="dv">1</span></a> -<a class="sourceLine" id="cb273-5" data-line-number="5">) <span class="op">>=</span><span class="st"> </span><span class="dv">2</span></a> -<a class="sourceLine" id="cb273-6" data-line-number="6"><span class="kw">rowData</span>(reads)<span class="op">$</span>use <-<span class="st"> </span>keep_feature</a></code></pre></div> -<div class="sourceCode" id="cb274"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb274-1" data-line-number="1"><span class="kw">table</span>(keep_feature)</a></code></pre></div> +<div class="sourceCode" id="cb282"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb282-1" data-line-number="1">keep_feature <-<span class="st"> </span><span class="kw">nexprs</span>(</a> +<a class="sourceLine" id="cb282-2" data-line-number="2"> reads[,<span class="kw">colData</span>(reads)<span class="op">$</span>use], </a> +<a class="sourceLine" id="cb282-3" data-line-number="3"> <span class="dt">byrow =</span> <span class="ot">TRUE</span>, </a> +<a class="sourceLine" id="cb282-4" data-line-number="4"> <span class="dt">detection_limit =</span> <span class="dv">1</span></a> +<a class="sourceLine" id="cb282-5" data-line-number="5">) <span class="op">>=</span><span class="st"> </span><span class="dv">2</span></a> +<a class="sourceLine" id="cb282-6" data-line-number="6"><span class="kw">rowData</span>(reads)<span class="op">$</span>use <-<span class="st"> </span>keep_feature</a></code></pre></div> +<div class="sourceCode" id="cb283"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb283-1" data-line-number="1"><span class="kw">table</span>(keep_feature)</a></code></pre></div> <pre><code>## keep_feature ## FALSE TRUE ## 2664 
16062</code></pre> -<div class="sourceCode" id="cb276"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb276-1" data-line-number="1"><span class="kw">dim</span>(reads[<span class="kw">rowData</span>(reads)<span class="op">$</span>use, <span class="kw">colData</span>(reads)<span class="op">$</span>use])</a></code></pre></div> +<div class="sourceCode" id="cb285"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb285-1" data-line-number="1"><span class="kw">dim</span>(reads[<span class="kw">rowData</span>(reads)<span class="op">$</span>use, <span class="kw">colData</span>(reads)<span class="op">$</span>use])</a></code></pre></div> <pre><code>## [1] 16062 606</code></pre> -<div class="sourceCode" id="cb278"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb278-1" data-line-number="1"><span class="kw">assay</span>(reads, <span class="st">"logcounts_raw"</span>) <-<span class="st"> </span><span class="kw">log2</span>(<span class="kw">counts</span>(reads) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)</a> -<a class="sourceLine" id="cb278-2" data-line-number="2"><span class="kw">reducedDim</span>(reads) <-<span class="st"> </span><span class="ot">NULL</span></a></code></pre></div> -<div class="sourceCode" id="cb279"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb279-1" data-line-number="1"><span class="kw">saveRDS</span>(reads, <span class="dt">file =</span> <span class="st">"data/tung/reads.rds"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb287"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb287-1" data-line-number="1"><span class="kw">assay</span>(reads, <span class="st">"logcounts_raw"</span>) <-<span class="st"> </span><span class="kw">log2</span>(<span class="kw">counts</span>(reads) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)</a> 
+<a class="sourceLine" id="cb287-2" data-line-number="2"><span class="kw">reducedDim</span>(reads) <-<span class="st"> </span><span class="ot">NULL</span></a></code></pre></div> +<div class="sourceCode" id="cb288"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb288-1" data-line-number="1"><span class="kw">saveRDS</span>(reads, <span class="dt">file =</span> <span class="st">"data/tung/reads.rds"</span>)</a></code></pre></div> <p>By comparing Figure <a href="quality-control-and-data-visualisation.html#fig:cell-filt-comp">6.5</a> and Figure <a href="quality-control-and-data-visualisation.html#fig:cell-filt-comp-reads">6.11</a>, it is clear that the reads based filtering removed more cells than the UMI based analysis. If you go back and compare the results you should be able to conclude that the ERCC and MT filters are more strict for the reads-based analysis.</p> -<div class="sourceCode" id="cb280"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb280-1" data-line-number="1"><span class="kw">sessionInfo</span>()</a></code></pre></div> +<div class="sourceCode" id="cb289"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb289-1" data-line-number="1"><span class="kw">sessionInfo</span>()</a></code></pre></div> <pre><code>## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS @@ -1395,12 +1428,12 @@ groups corresponding to each individual.</p> to get a “feel†for a dataset. 
This is an area of data analysis that is perhaps <a href="https://www.youtube.com/watch?v=W5JqB6e5QwU">more art than science</a>, but is a crucial aspect of single-cell QC and analysis.</p> -<div class="sourceCode" id="cb282"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb282-1" data-line-number="1"><span class="kw">library</span>(SingleCellExperiment)</a> -<a class="sourceLine" id="cb282-2" data-line-number="2"><span class="kw">library</span>(scater)</a> -<a class="sourceLine" id="cb282-3" data-line-number="3"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb282-4" data-line-number="4">umi <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/tung/umi.rds"</span>)</a> -<a class="sourceLine" id="cb282-5" data-line-number="5">umi.qc <-<span class="st"> </span>umi[<span class="kw">rowData</span>(umi)<span class="op">$</span>use, <span class="kw">colData</span>(umi)<span class="op">$</span>use]</a> -<a class="sourceLine" id="cb282-6" data-line-number="6">endog_genes <-<span class="st"> </span><span class="op">!</span><span class="kw">rowData</span>(umi.qc)<span class="op">$</span>is_feature_control</a></code></pre></div> +<div class="sourceCode" id="cb291"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb291-1" data-line-number="1"><span class="kw">library</span>(SingleCellExperiment)</a> +<a class="sourceLine" id="cb291-2" data-line-number="2"><span class="kw">library</span>(scater)</a> +<a class="sourceLine" id="cb291-3" data-line-number="3"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb291-4" data-line-number="4">umi <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/tung/umi.rds"</span>)</a> +<a class="sourceLine" id="cb291-5" data-line-number="5">umi.qc <-<span 
class="st"> </span>umi[<span class="kw">rowData</span>(umi)<span class="op">$</span>use, <span class="kw">colData</span>(umi)<span class="op">$</span>use]</a> +<a class="sourceLine" id="cb291-6" data-line-number="6">endog_genes <-<span class="st"> </span><span class="op">!</span><span class="kw">rowData</span>(umi.qc)<span class="op">$</span>is_feature_control</a></code></pre></div> </div> <div id="visual-pca" class="section level3"> <h3><span class="header-section-number">6.6.2</span> PCA plot</h3> @@ -1428,16 +1461,16 @@ Figure 6.13: Schematic representation of PCA dimensionality reduction <div id="before-qc" class="section level4"> <h4><span class="header-section-number">6.6.2.1</span> Before QC</h4> <p>Without log-transformation:</p> -<div class="sourceCode" id="cb283"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb283-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> -<a class="sourceLine" id="cb283-2" data-line-number="2"> umi[endog_genes, ],</a> -<a class="sourceLine" id="cb283-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"counts"</span></a> -<a class="sourceLine" id="cb283-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb283-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb283-6" data-line-number="6"> tmp,</a> -<a class="sourceLine" id="cb283-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb283-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb283-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb283-10" data-line-number="10">)</a></code></pre></div> +<div class="sourceCode" id="cb292"><pre class="sourceCode r"><code class="sourceCode r"><a 
class="sourceLine" id="cb292-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> +<a class="sourceLine" id="cb292-2" data-line-number="2"> umi[endog_genes, ],</a> +<a class="sourceLine" id="cb292-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"counts"</span></a> +<a class="sourceLine" id="cb292-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb292-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb292-6" data-line-number="6"> tmp,</a> +<a class="sourceLine" id="cb292-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb292-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb292-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb292-10" data-line-number="10">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:expr-overview-pca-before-qc1"></span> <img src="data-viz_files/figure-html/expr-overview-pca-before-qc1-1.png" alt="PCA plot of the tung data" width="90%" /> <p class="caption"> @@ -1445,16 +1478,16 @@ Figure 6.14: PCA plot of the tung data </p> </div> <p>With log-transformation:</p> -<div class="sourceCode" id="cb284"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb284-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> -<a class="sourceLine" id="cb284-2" data-line-number="2"> umi[endog_genes, ],</a> -<a class="sourceLine" id="cb284-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span></a> -<a class="sourceLine" id="cb284-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb284-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> -<a 
class="sourceLine" id="cb284-6" data-line-number="6"> tmp,</a> -<a class="sourceLine" id="cb284-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb284-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb284-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb284-10" data-line-number="10">)</a></code></pre></div> +<div class="sourceCode" id="cb293"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb293-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> +<a class="sourceLine" id="cb293-2" data-line-number="2"> umi[endog_genes, ],</a> +<a class="sourceLine" id="cb293-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span></a> +<a class="sourceLine" id="cb293-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb293-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb293-6" data-line-number="6"> tmp,</a> +<a class="sourceLine" id="cb293-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb293-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb293-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb293-10" data-line-number="10">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:expr-overview-pca-before-qc2"></span> <img src="data-viz_files/figure-html/expr-overview-pca-before-qc2-1.png" alt="PCA plot of the tung data" width="90%" /> <p class="caption"> @@ -1476,16 +1509,16 @@ normalised by 
library size (e.g. CPM normalisation). In the course we use </div> <div id="after-qc" class="section level4"> <h4><span class="header-section-number">6.6.2.2</span> After QC</h4> -<div class="sourceCode" id="cb285"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb285-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> -<a class="sourceLine" id="cb285-2" data-line-number="2"> umi.qc[endog_genes, ],</a> -<a class="sourceLine" id="cb285-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span></a> -<a class="sourceLine" id="cb285-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb285-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb285-6" data-line-number="6"> tmp,</a> -<a class="sourceLine" id="cb285-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb285-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb285-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb285-10" data-line-number="10">)</a></code></pre></div> +<div class="sourceCode" id="cb294"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb294-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> +<a class="sourceLine" id="cb294-2" data-line-number="2"> umi.qc[endog_genes, ],</a> +<a class="sourceLine" id="cb294-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span></a> +<a class="sourceLine" id="cb294-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb294-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb294-6" data-line-number="6"> tmp,</a> 
+<a class="sourceLine" id="cb294-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb294-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb294-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb294-10" data-line-number="10">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:expr-overview-pca-after-qc"></span> <img src="data-viz_files/figure-html/expr-overview-pca-after-qc-1.png" alt="PCA plot of the tung data" width="90%" /> <p class="caption"> @@ -1558,18 +1591,18 @@ ensure reproducibility, we fix the “seed†of the random-number generator in code below so that we always get the same plot.</p> <div id="before-qc-1" class="section level4"> <h4><span class="header-section-number">6.6.3.1</span> Before QC</h4> -<div class="sourceCode" id="cb286"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb286-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">123456</span>)</a> -<a class="sourceLine" id="cb286-2" data-line-number="2">tmp <-<span class="st"> </span><span class="kw">runTSNE</span>(</a> -<a class="sourceLine" id="cb286-3" data-line-number="3"> umi[endog_genes, ],</a> -<a class="sourceLine" id="cb286-4" data-line-number="4"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span>,</a> -<a class="sourceLine" id="cb286-5" data-line-number="5"> <span class="dt">perplexity =</span> <span class="dv">130</span></a> -<a class="sourceLine" id="cb286-6" data-line-number="6">)</a> -<a class="sourceLine" id="cb286-7" data-line-number="7"><span class="kw">plotTSNE</span>(</a> -<a class="sourceLine" id="cb286-8" data-line-number="8"> tmp,</a> -<a class="sourceLine" id="cb286-9" data-line-number="9"> <span class="dt">colour_by 
=</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb286-10" data-line-number="10"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb286-11" data-line-number="11"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb286-12" data-line-number="12">)</a></code></pre></div> +<div class="sourceCode" id="cb295"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb295-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">123456</span>)</a> +<a class="sourceLine" id="cb295-2" data-line-number="2">tmp <-<span class="st"> </span><span class="kw">runTSNE</span>(</a> +<a class="sourceLine" id="cb295-3" data-line-number="3"> umi[endog_genes, ],</a> +<a class="sourceLine" id="cb295-4" data-line-number="4"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span>,</a> +<a class="sourceLine" id="cb295-5" data-line-number="5"> <span class="dt">perplexity =</span> <span class="dv">130</span></a> +<a class="sourceLine" id="cb295-6" data-line-number="6">)</a> +<a class="sourceLine" id="cb295-7" data-line-number="7"><span class="kw">plotTSNE</span>(</a> +<a class="sourceLine" id="cb295-8" data-line-number="8"> tmp,</a> +<a class="sourceLine" id="cb295-9" data-line-number="9"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb295-10" data-line-number="10"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb295-11" data-line-number="11"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb295-12" data-line-number="12">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:expr-overview-tsne-before-qc"></span> <img 
src="data-viz_files/figure-html/expr-overview-tsne-before-qc-1.png" alt="tSNE map of the tung data" width="90%" /> <p class="caption"> @@ -1579,18 +1612,18 @@ Figure 6.19: tSNE map of the tung data </div> <div id="after-qc-1" class="section level4"> <h4><span class="header-section-number">6.6.3.2</span> After QC</h4> -<div class="sourceCode" id="cb287"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb287-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">123456</span>)</a> -<a class="sourceLine" id="cb287-2" data-line-number="2">tmp <-<span class="st"> </span><span class="kw">runTSNE</span>(</a> -<a class="sourceLine" id="cb287-3" data-line-number="3"> umi.qc[endog_genes, ],</a> -<a class="sourceLine" id="cb287-4" data-line-number="4"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span>,</a> -<a class="sourceLine" id="cb287-5" data-line-number="5"> <span class="dt">perplexity =</span> <span class="dv">130</span></a> -<a class="sourceLine" id="cb287-6" data-line-number="6">)</a> -<a class="sourceLine" id="cb287-7" data-line-number="7"><span class="kw">plotTSNE</span>(</a> -<a class="sourceLine" id="cb287-8" data-line-number="8"> tmp,</a> -<a class="sourceLine" id="cb287-9" data-line-number="9"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb287-10" data-line-number="10"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb287-11" data-line-number="11"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb287-12" data-line-number="12">)</a></code></pre></div> +<div class="sourceCode" id="cb296"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb296-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">123456</span>)</a> +<a class="sourceLine" id="cb296-2" 
data-line-number="2">tmp <-<span class="st"> </span><span class="kw">runTSNE</span>(</a> +<a class="sourceLine" id="cb296-3" data-line-number="3"> umi.qc[endog_genes, ],</a> +<a class="sourceLine" id="cb296-4" data-line-number="4"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span>,</a> +<a class="sourceLine" id="cb296-5" data-line-number="5"> <span class="dt">perplexity =</span> <span class="dv">130</span></a> +<a class="sourceLine" id="cb296-6" data-line-number="6">)</a> +<a class="sourceLine" id="cb296-7" data-line-number="7"><span class="kw">plotTSNE</span>(</a> +<a class="sourceLine" id="cb296-8" data-line-number="8"> tmp,</a> +<a class="sourceLine" id="cb296-9" data-line-number="9"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb296-10" data-line-number="10"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb296-11" data-line-number="11"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb296-12" data-line-number="12">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:expr-overview-tsne-after-qc"></span> <img src="data-viz_files/figure-html/expr-overview-tsne-after-qc-1.png" alt="tSNE map of the tung data" width="90%" /> <p class="caption"> @@ -1705,89 +1738,89 @@ please compare your results to ours (next chapter).</p> </div> <div id="exercise-data-visualization-reads" class="section level2"> <h2><span class="header-section-number">6.7</span> Exercise: Data visualization (Reads)</h2> -<div class="sourceCode" id="cb289"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb289-1" data-line-number="1"><span class="kw">library</span>(scater)</a> -<a class="sourceLine" id="cb289-2" data-line-number="2"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span 
class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb289-3" data-line-number="3">reads <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/tung/reads.rds"</span>)</a> -<a class="sourceLine" id="cb289-4" data-line-number="4">reads.qc <-<span class="st"> </span>reads[<span class="kw">rowData</span>(reads)<span class="op">$</span>use, <span class="kw">colData</span>(reads)<span class="op">$</span>use]</a> -<a class="sourceLine" id="cb289-5" data-line-number="5">endog_genes <-<span class="st"> </span><span class="op">!</span><span class="kw">rowData</span>(reads.qc)<span class="op">$</span>is_feature_control</a></code></pre></div> -<div class="sourceCode" id="cb290"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb290-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> -<a class="sourceLine" id="cb290-2" data-line-number="2"> reads[endog_genes, ],</a> -<a class="sourceLine" id="cb290-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"counts"</span></a> -<a class="sourceLine" id="cb290-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb290-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb290-6" data-line-number="6"> tmp,</a> -<a class="sourceLine" id="cb290-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb290-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb290-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb290-10" data-line-number="10">)</a></code></pre></div> +<div class="sourceCode" id="cb298"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb298-1" data-line-number="1"><span 
class="kw">library</span>(scater)</a> +<a class="sourceLine" id="cb298-2" data-line-number="2"><span class="kw">options</span>(<span class="dt">stringsAsFactors =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb298-3" data-line-number="3">reads <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/tung/reads.rds"</span>)</a> +<a class="sourceLine" id="cb298-4" data-line-number="4">reads.qc <-<span class="st"> </span>reads[<span class="kw">rowData</span>(reads)<span class="op">$</span>use, <span class="kw">colData</span>(reads)<span class="op">$</span>use]</a> +<a class="sourceLine" id="cb298-5" data-line-number="5">endog_genes <-<span class="st"> </span><span class="op">!</span><span class="kw">rowData</span>(reads.qc)<span class="op">$</span>is_feature_control</a></code></pre></div> +<div class="sourceCode" id="cb299"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb299-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> +<a class="sourceLine" id="cb299-2" data-line-number="2"> reads[endog_genes, ],</a> +<a class="sourceLine" id="cb299-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"counts"</span></a> +<a class="sourceLine" id="cb299-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb299-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb299-6" data-line-number="6"> tmp,</a> +<a class="sourceLine" id="cb299-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb299-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb299-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb299-10" data-line-number="10">)</a></code></pre></div> 
<div class="figure" style="text-align: center"><span id="fig:expr-overview-pca-before-qc-reads1"></span> <img src="data-viz-reads_files/figure-html/expr-overview-pca-before-qc-reads1-1.png" alt="PCA plot of the tung data" width="90%" /> <p class="caption"> Figure 6.23: PCA plot of the tung data </p> </div> -<div class="sourceCode" id="cb291"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb291-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> -<a class="sourceLine" id="cb291-2" data-line-number="2"> reads[endog_genes, ],</a> -<a class="sourceLine" id="cb291-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span></a> -<a class="sourceLine" id="cb291-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb291-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb291-6" data-line-number="6"> tmp,</a> -<a class="sourceLine" id="cb291-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb291-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb291-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb291-10" data-line-number="10">)</a></code></pre></div> +<div class="sourceCode" id="cb300"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb300-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> +<a class="sourceLine" id="cb300-2" data-line-number="2"> reads[endog_genes, ],</a> +<a class="sourceLine" id="cb300-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span></a> +<a class="sourceLine" id="cb300-4" data-line-number="4">)</a> +<a class="sourceLine" 
id="cb300-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb300-6" data-line-number="6"> tmp,</a> +<a class="sourceLine" id="cb300-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb300-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb300-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb300-10" data-line-number="10">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:expr-overview-pca-before-qc-reads2"></span> <img src="data-viz-reads_files/figure-html/expr-overview-pca-before-qc-reads2-1.png" alt="PCA plot of the tung data" width="90%" /> <p class="caption"> Figure 6.24: PCA plot of the tung data </p> </div> -<div class="sourceCode" id="cb292"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb292-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> -<a class="sourceLine" id="cb292-2" data-line-number="2"> reads.qc[endog_genes, ],</a> -<a class="sourceLine" id="cb292-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span></a> -<a class="sourceLine" id="cb292-4" data-line-number="4">)</a> -<a class="sourceLine" id="cb292-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> -<a class="sourceLine" id="cb292-6" data-line-number="6"> tmp,</a> -<a class="sourceLine" id="cb292-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb292-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb292-9" data-line-number="9"> <span class="dt">shape_by =</span> <span 
class="st">"individual"</span></a> -<a class="sourceLine" id="cb292-10" data-line-number="10">)</a></code></pre></div> +<div class="sourceCode" id="cb301"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb301-1" data-line-number="1">tmp <-<span class="st"> </span><span class="kw">runPCA</span>(</a> +<a class="sourceLine" id="cb301-2" data-line-number="2"> reads.qc[endog_genes, ],</a> +<a class="sourceLine" id="cb301-3" data-line-number="3"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span></a> +<a class="sourceLine" id="cb301-4" data-line-number="4">)</a> +<a class="sourceLine" id="cb301-5" data-line-number="5"><span class="kw">plotPCA</span>(</a> +<a class="sourceLine" id="cb301-6" data-line-number="6"> tmp,</a> +<a class="sourceLine" id="cb301-7" data-line-number="7"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb301-8" data-line-number="8"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb301-9" data-line-number="9"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb301-10" data-line-number="10">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:expr-overview-pca-after-qc-reads"></span> <img src="data-viz-reads_files/figure-html/expr-overview-pca-after-qc-reads-1.png" alt="PCA plot of the tung data" width="90%" /> <p class="caption"> Figure 6.25: PCA plot of the tung data </p> </div> -<div class="sourceCode" id="cb293"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb293-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">123456</span>)</a> -<a class="sourceLine" id="cb293-2" data-line-number="2">tmp <-<span class="st"> </span><span class="kw">runTSNE</span>(</a> -<a class="sourceLine" id="cb293-3" data-line-number="3"> reads[endog_genes, 
],</a> -<a class="sourceLine" id="cb293-4" data-line-number="4"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span>,</a> -<a class="sourceLine" id="cb293-5" data-line-number="5"> <span class="dt">perplexity =</span> <span class="dv">130</span></a> -<a class="sourceLine" id="cb293-6" data-line-number="6">)</a> -<a class="sourceLine" id="cb293-7" data-line-number="7"><span class="kw">plotTSNE</span>(</a> -<a class="sourceLine" id="cb293-8" data-line-number="8"> tmp,</a> -<a class="sourceLine" id="cb293-9" data-line-number="9"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb293-10" data-line-number="10"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb293-11" data-line-number="11"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb293-12" data-line-number="12">)</a></code></pre></div> +<div class="sourceCode" id="cb302"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb302-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">123456</span>)</a> +<a class="sourceLine" id="cb302-2" data-line-number="2">tmp <-<span class="st"> </span><span class="kw">runTSNE</span>(</a> +<a class="sourceLine" id="cb302-3" data-line-number="3"> reads[endog_genes, ],</a> +<a class="sourceLine" id="cb302-4" data-line-number="4"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span>,</a> +<a class="sourceLine" id="cb302-5" data-line-number="5"> <span class="dt">perplexity =</span> <span class="dv">130</span></a> +<a class="sourceLine" id="cb302-6" data-line-number="6">)</a> +<a class="sourceLine" id="cb302-7" data-line-number="7"><span class="kw">plotTSNE</span>(</a> +<a class="sourceLine" id="cb302-8" data-line-number="8"> tmp,</a> +<a class="sourceLine" id="cb302-9" data-line-number="9"> <span 
class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb302-10" data-line-number="10"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb302-11" data-line-number="11"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb302-12" data-line-number="12">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:expr-overview-tsne-before-qc-reads"></span> <img src="data-viz-reads_files/figure-html/expr-overview-tsne-before-qc-reads-1.png" alt="tSNE map of the tung data" width="90%" /> <p class="caption"> Figure 6.26: tSNE map of the tung data </p> </div> -<div class="sourceCode" id="cb294"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb294-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">123456</span>)</a> -<a class="sourceLine" id="cb294-2" data-line-number="2">tmp <-<span class="st"> </span><span class="kw">runTSNE</span>(</a> -<a class="sourceLine" id="cb294-3" data-line-number="3"> reads.qc[endog_genes, ],</a> -<a class="sourceLine" id="cb294-4" data-line-number="4"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span>,</a> -<a class="sourceLine" id="cb294-5" data-line-number="5"> <span class="dt">perplexity =</span> <span class="dv">130</span></a> -<a class="sourceLine" id="cb294-6" data-line-number="6">)</a> -<a class="sourceLine" id="cb294-7" data-line-number="7"><span class="kw">plotTSNE</span>(</a> -<a class="sourceLine" id="cb294-8" data-line-number="8"> tmp,</a> -<a class="sourceLine" id="cb294-9" data-line-number="9"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> -<a class="sourceLine" id="cb294-10" data-line-number="10"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> -<a class="sourceLine" id="cb294-11" 
data-line-number="11"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> -<a class="sourceLine" id="cb294-12" data-line-number="12">)</a></code></pre></div> +<div class="sourceCode" id="cb303"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb303-1" data-line-number="1"><span class="kw">set.seed</span>(<span class="dv">123456</span>)</a> +<a class="sourceLine" id="cb303-2" data-line-number="2">tmp <-<span class="st"> </span><span class="kw">runTSNE</span>(</a> +<a class="sourceLine" id="cb303-3" data-line-number="3"> reads.qc[endog_genes, ],</a> +<a class="sourceLine" id="cb303-4" data-line-number="4"> <span class="dt">exprs_values =</span> <span class="st">"logcounts_raw"</span>,</a> +<a class="sourceLine" id="cb303-5" data-line-number="5"> <span class="dt">perplexity =</span> <span class="dv">130</span></a> +<a class="sourceLine" id="cb303-6" data-line-number="6">)</a> +<a class="sourceLine" id="cb303-7" data-line-number="7"><span class="kw">plotTSNE</span>(</a> +<a class="sourceLine" id="cb303-8" data-line-number="8"> tmp,</a> +<a class="sourceLine" id="cb303-9" data-line-number="9"> <span class="dt">colour_by =</span> <span class="st">"batch"</span>,</a> +<a class="sourceLine" id="cb303-10" data-line-number="10"> <span class="dt">size_by =</span> <span class="st">"total_features_by_counts"</span>,</a> +<a class="sourceLine" id="cb303-11" data-line-number="11"> <span class="dt">shape_by =</span> <span class="st">"individual"</span></a> +<a class="sourceLine" id="cb303-12" data-line-number="12">)</a></code></pre></div> <div class="figure" style="text-align: center"><span id="fig:expr-overview-tsne-after-qc-reads"></span> <img src="data-viz-reads_files/figure-html/expr-overview-tsne-after-qc-reads-1.png" alt="tSNE map of the tung data" width="90%" /> <p class="caption"> @@ -1806,7 +1839,7 @@ Figure 6.21: tSNE map of the tung data (perplexity = 10) Figure 6.22: tSNE map of the tung data (perplexity = 200) 
</p> </div> -<div class="sourceCode" id="cb295"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb295-1" data-line-number="1"><span class="kw">sessionInfo</span>()</a></code></pre></div> +<div class="sourceCode" id="cb304"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb304-1" data-line-number="1"><span class="kw">sessionInfo</span>()</a></code></pre></div> <pre><code>## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS diff --git a/public/references.html b/public/references.html index 8fcde27092d01c50a6287b8d7139fe177c92d6ca..de88f2555a2f3be5a97b1adc34281b3bf0ff82b6 100644 --- a/public/references.html +++ b/public/references.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" 
data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" 
data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> 
-<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM 
general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> 
Differential Expression (DE) analysis</a><ul> @@ -521,9 +521,15 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <p>Archer, Nathan, Mark D. Walsh, Vahid Shahrezaei, and Daniel Hebenstreit. 2016. “Modeling Enzyme Processivity Reveals That RNA-Seq Libraries Are Biased in Characteristic and Correctable Ways.†<em>Cell Systems</em> 3 (5). Elsevier BV: 467–479.e12. <a href="https://doi.org/10.1016/j.cels.2016.10.012">https://doi.org/10.1016/j.cels.2016.10.012</a>.</p> </div> <div> +<p>Blondel, Vincent D, Jean-Loup Guillaume, Renaud Lambiotte, and Etienne Lefebvre. 2008. “Fast Unfolding of Communities in Large Networks.†<em>Journal of Statistical Mechanics: Theory and Experiment</em> 2008 (10). IOP Publishing: P10008.</p> +</div> +<div> <p>Bray, Nicolas L, Harold Pimentel, Páll Melsted, and Lior Pachter. 2016. “Near-Optimal Probabilistic Rna-Seq Quantification.†<em>Nat Biotechnol</em> 34 (5): 525–27. <a href="https://doi.org/10.1038/nbt.3519">https://doi.org/10.1038/nbt.3519</a>.</p> </div> <div> +<p>Buettner, Florian, Naruemon Pratanwanich, Davis J McCarthy, John C Marioni, and Oliver Stegle. 2017. “F-scLVM: Scalable and Versatile Factor Analysis for Single-Cell Rna-Seq.†<em>Genome Biology</em> 18 (1). BioMed Central: 212.</p> +</div> +<div> <p>Bullard, James H, Elizabeth Purdom, Kasper D Hansen, and Sandrine Dudoit. 2010. “Evaluation of Statistical Methods for Normalization and Differential Expression in mRNA-Seq Experiments.†<em>BMC Bioinformatics</em> 11 (1). Springer Nature: 94. <a href="https://doi.org/10.1186/1471-2105-11-94">https://doi.org/10.1186/1471-2105-11-94</a>.</p> </div> <div> @@ -536,15 +542,24 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <p>Coifman, Ronald R, and Stéphane Lafon. 2006. “Diffusion Maps.†<em>Appl. Comput. Harmon. Anal.</em> 21 (1): 5–30.</p> </div> <div> +<p>Collins, Michael, Sanjoy Dasgupta, and Robert E Schapire. 2002. 
“A Generalization of Principal Components Analysis to the Exponential Family.†In <em>Advances in Neural Information Processing Systems</em>, 617–24.</p> +</div> +<div> <p>Deng, Q., D. Ramskold, B. Reinius, and R. Sandberg. 2014. “Single-Cell RNA-Seq Reveals Dynamic, Random Monoallelic Gene Expression in Mammalian Cells.†<em>Science</em> 343 (6167). American Association for the Advancement of Science (AAAS): 193–96. <a href="https://doi.org/10.1126/science.1245316">https://doi.org/10.1126/science.1245316</a>.</p> </div> <div> <p>Dobin, Alexander, Carrie A Davis, Felix Schlesinger, Jorg Drenkow, Chris Zaleski, Sonali Jha, Philippe Batut, Mark Chaisson, and Thomas R Gingeras. 2013. “STAR: Ultrafast Universal Rna-Seq Aligner.†<em>Bioinformatics</em> 29 (1): 15–21. <a href="https://doi.org/10.1093/bioinformatics/bts635">https://doi.org/10.1093/bioinformatics/bts635</a>.</p> </div> <div> +<p>Freytag, Saskia, Luyi Tian, Ingrid Lönnstedt, Milica Ng, and Melanie Bahlo. 2018. “Comparison of Clustering Tools in R for Medium-Sized 10x Genomics Single-Cell Rna-Sequencing Data.†<em>F1000Research</em> 7. Faculty of 1000 Ltd.</p> +</div> +<div> <p>Gierahn, Todd M, Marc H Wadsworth 2nd, Travis K Hughes, Bryan D Bryson, Andrew Butler, Rahul Satija, Sarah Fortune, J Christopher Love, and Alex K Shalek. 2017. “Seq-Well: Portable, Low-Cost RNA Sequencing of Single Cells at High Throughput.†<em>Nat. Methods</em> 14 (4): 395–98.</p> </div> <div> +<p>Good, Benjamin H, Yves-Alexandre De Montjoye, and Aaron Clauset. 2010. “Performance of Modularity Maximization in Practical Contexts.†<em>Physical Review E</em> 81 (4). APS: 046106.</p> +</div> +<div> <p>Hafemeister, Christoph, and Rahul Satija. 2019. “Normalization and variance stabilization of single-cell RNA-seq data using regularized negative binomial regression.†<em>bioRxiv</em>. 
<a href="https://doi.org/10.1101/576827">https://doi.org/10.1101/576827</a>.</p> </div> <div> @@ -560,6 +575,9 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <p>Hastie, Trevor, and Werner Stuetzle. 1989. “Principal Curves.†AT&T Bell Laboratorie,Murray Hill; Journal of the American Statistical Association.</p> </div> <div> +<p>Hinton, Geoffrey E, and Sam T Roweis. 2003. “Stochastic Neighbor Embedding.†In <em>Advances in Neural Information Processing Systems</em>, 857–64.</p> +</div> +<div> <p>Islam, Saiful, Amit Zeisel, Simon Joost, Gioele La Manno, Pawel Zajac, Maria Kasper, Peter Lönnerberg, and Sten Linnarsson. 2013. “Quantitative Single-Cell RNA-Seq with Unique Molecular Identifiers.†<em>Nat Meth</em> 11 (2). Springer Nature: 163–66. <a href="https://doi.org/10.1038/nmeth.2772">https://doi.org/10.1038/nmeth.2772</a>.</p> </div> <div> @@ -572,6 +590,9 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <p>Kharchenko, Peter V, Lev Silberstein, and David T Scadden. 2014. “Bayesian Approach to Single-Cell Differential Expression Analysis.†<em>Nat Meth</em> 11 (7). Springer Nature: 740–42. <a href="https://doi.org/10.1038/nmeth.2967">https://doi.org/10.1038/nmeth.2967</a>.</p> </div> <div> +<p>Kingma, Diederik P, and Max Welling. 2013. “Auto-Encoding Variational Bayes.†<em>arXiv Preprint arXiv:1312.6114</em>.</p> +</div> +<div> <p>Kiselev, Vladimir Yu, and Martin Hemberg. 2017. “Scmap - a Tool for Unsupervised Projection of Single Cell RNA-seq Data.†<em>bioRxiv</em>, July, 150292.</p> </div> <div> @@ -593,15 +614,27 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <p>L. Lun, Aaron T., Karsten Bach, and John C. Marioni. 2016. “Pooling Across Cells to Normalize Single-Cell RNA Sequencing Data with Many Zero Counts.†<em>Genome Biol</em> 17 (1). Springer Nature. 
<a href="https://doi.org/10.1186/s13059-016-0947-7">https://doi.org/10.1186/s13059-016-0947-7</a>.</p> </div> <div> +<p>Maaten, Laurens van der, and Geoffrey Hinton. 2008. “Visualizing Data Using T-Sne.†<em>Journal of Machine Learning Research</em> 9 (Nov): 2579–2605.</p> +</div> +<div> <p>Macosko, Evan Z., Anindita Basu, Rahul Satija, James Nemesh, Karthik Shekhar, Melissa Goldman, Itay Tirosh, et al. 2015. “Highly Parallel Genome-Wide Expression Profiling of Individual Cells Using Nanoliter Droplets.†<em>Cell</em> 161 (5). Elsevier BV: 1202–14. <a href="https://doi.org/10.1016/j.cell.2015.05.002">https://doi.org/10.1016/j.cell.2015.05.002</a>.</p> </div> <div> <p>McCarthy, Davis J., Kieran R. Campbell, Aaron T. L. Lun, and Quin F. Wills. 2017. “Scater: Pre-processing, Quality Control, Normalization and Visualization of Single-Cell RNA-Seq Data in R.†<em>Bioinformatics</em>, January. Oxford University Press (OUP), btw777. <a href="https://doi.org/10.1093/bioinformatics/btw777">https://doi.org/10.1093/bioinformatics/btw777</a>.</p> </div> <div> +<p>McInnes, Leland, John Healy, and James Melville. 2018. “Umap: Uniform Manifold Approximation and Projection for Dimension Reduction.†<em>arXiv Preprint arXiv:1802.03426</em>.</p> +</div> +<div> +<p>Moon, Kevin R, David van Dijk, Zheng Wang, William Chen, Matthew J Hirn, Ronald R Coifman, Natalia B Ivanova, Guy Wolf, and Smita Krishnaswamy. 2017. “PHATE: A Dimensionality Reduction Method for Visualizing Trajectory Structures in High-Dimensional Biological Data.†<em>bioRxiv</em>. Cold Spring Harbor Laboratory, 120378.</p> +</div> +<div> <p>Muraro, Mauro J., Gitanjali Dharmadhikari, Dominic Grün, Nathalie Groen, Tim Dielen, Erik Jansen, Leon van Gurp, et al. 2016. “A Single-Cell Transcriptome Atlas of the Human Pancreas.†<em>Cell Systems</em> 3 (4). Elsevier BV: 385–394.e3. 
<a href="https://doi.org/10.1016/j.cels.2016.09.002">https://doi.org/10.1016/j.cels.2016.09.002</a>.</p> </div> <div> +<p>Newman, Mark EJ, and Michelle Girvan. 2004. “Finding and Evaluating Community Structure in Networks.†<em>Physical Review E</em> 69 (2). APS: 026113.</p> +</div> +<div> <p>Picelli, Simone, Ã…sa K Björklund, Omid R Faridani, Sven Sagasser, Gösta Winberg, and Rickard Sandberg. 2013. “Smart-Seq2 for Sensitive Full-Length Transcriptome Profiling in Single Cells.†<em>Nat Meth</em> 10 (11). Springer Nature: 1096–8. <a href="https://doi.org/10.1038/nmeth.2639">https://doi.org/10.1038/nmeth.2639</a>.</p> </div> <div> @@ -641,6 +674,12 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <p>Tang, Fuchou, Catalin Barbacioru, Yangzhou Wang, Ellen Nordman, Clarence Lee, Nanlan Xu, Xiaohui Wang, et al. 2009. “mRNA-Seq Whole-Transcriptome Analysis of a Single Cell.†<em>Nat Meth</em> 6 (5). Springer Nature: 377–82. <a href="https://doi.org/10.1038/nmeth.1315">https://doi.org/10.1038/nmeth.1315</a>.</p> </div> <div> +<p>Townes, F William, Stephanie C Hicks, Martin J Aryee, and Rafael A Irizarry. 2019. “Feature Selection and Dimension Reduction for Single Cell Rna-Seq Based on a Multinomial Model.†<em>bioRxiv</em>. Cold Spring Harbor Laboratory, 574574.</p> +</div> +<div> +<p>Traag, Vincent A, Ludo Waltman, and Nees Jan van Eck. 2019. “From Louvain to Leiden: Guaranteeing Well-Connected Communities.†<em>Scientific Reports</em> 9. Nature Publishing Group.</p> +</div> +<div> <p>Trapnell, Cole, Davide Cacchiarelli, Jonna Grimsby, Prapti Pokharel, Shuqiang Li, Michael Morse, Niall J Lennon, Kenneth J Livak, Tarjei S Mikkelsen, and John L Rinn. 2014. “The Dynamics and Regulators of Cell Fate Decisions Are Revealed by Pseudotemporal Ordering of Single Cells.†<em>Nat. 
Biotechnol.</em> 32 (4): 381–86.</p> </div> <div> @@ -656,6 +695,9 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <p>William Townes, F, Stephanie C Hicks, Martin J Aryee, and Rafael A Irizarry. 2019. “Feature Selection and Dimension Reduction for Single Cell RNA-Seq based on a Multinomial Model.†<em>bioRxiv</em>. <a href="https://doi.org/10.1101/574574">https://doi.org/10.1101/574574</a>.</p> </div> <div> +<p>Xu, Chen, and Zhengchang Su. 2015. “Identification of Cell Types from Single-Cell Transcriptomes Using a Novel Clustering Method.†<em>Bioinformatics</em> 31 (12). Oxford University Press: 1974–80.</p> +</div> +<div> <p>Ziegenhain, Christoph, Beate Vieth, Swati Parekh, Björn Reinius, Amy Guillaumet-Adkins, Martha Smets, Heinrich Leonhardt, Holger Heyn, Ines Hellmann, and Wolfgang Enard. 2017. “Comparative Analysis of Single-Cell RNA Sequencing Methods.†<em>Molecular Cell</em> 65 (4). Elsevier BV: 631–643.e4. <a href="https://doi.org/10.1016/j.molcel.2017.01.023">https://doi.org/10.1016/j.molcel.2017.01.023</a>.</p> </div> </div> diff --git a/public/remove-conf.md b/public/remove-conf.md index 10ada6f16d20f385dd7bf4c175c0dc32535cfe90..480ed1b947eeb5764e4fcad6480f0c2c30204241 100644 --- a/public/remove-conf.md +++ b/public/remove-conf.md @@ -4,7 +4,7 @@ output: html_document -## Dealing with confounders +## Batch effects ### Introduction @@ -160,14 +160,3819 @@ Perform LM correction for each individual separately. Store the final corrected matrix in the `lm_batch_indi` slot. +```r +## define cellular detection rate (cdr), i.e. 
proportion of genes expressed in each cell +umi.qc$cdr <- umi.qc$total_features_by_counts_endogenous / nrow(umi.qc) +## fit a model just accounting for batch by individual +lm_design_batch1 <- model.matrix(~batch + cdr, + data = colData(umi.qc)[umi.qc$individual == "na19098",]) +fit_indi1 <- lmfit(logcounts(umi.qc)[, umi.qc$individual == "na19098"], lm_design_batch1) +fit_indi1$coefficients[,1] <- 0 ## replace intercept with 0 to preserve reference batch +resids_lm_batch1 <- residuals(fit_indi1, logcounts(umi.qc)[, umi.qc$individual == "na19098"]) + +lm_design_batch2 <- model.matrix(~batch + cdr, + data = colData(umi.qc)[umi.qc$individual == "na19101",]) +fit_indi2 <- lmfit(logcounts(umi.qc)[, umi.qc$individual == "na19101"], lm_design_batch2) +fit_indi2$coefficients[,1] <- 0 ## replace intercept with 0 to preserve reference batch +resids_lm_batch2 <- residuals(fit_indi2, logcounts(umi.qc)[, umi.qc$individual == "na19101"]) + +lm_design_batch3 <- model.matrix(~batch + cdr, + data = colData(umi.qc)[umi.qc$individual == "na19239",]) +fit_indi3 <- lmfit(logcounts(umi.qc)[, umi.qc$individual == "na19239"], lm_design_batch3) +fit_indi3$coefficients[,1] <- 0 ## replace intercept with 0 to preserve reference batch +resids_lm_batch3 <- residuals(fit_indi3, logcounts(umi.qc)[, umi.qc$individual == "na19239"]) + +identical(colnames(umi.qc), colnames(cbind(resids_lm_batch1, resids_lm_batch2, resids_lm_batch3))) +assay(umi.qc, "lm_batch_indi") <- cbind(resids_lm_batch1, resids_lm_batch2, resids_lm_batch3) + +reduceddim(umi.qc, "pca_lm_batch_indi") <- reduceddim( + runpca(umi.qc[endog_genes, ], exprs_values = "lm_batch_indi"), "pca") + +plotreduceddim(umi.qc, use_dimred = "pca_lm_batch_indi", + colour_by = "batch", + size_by = "total_features_by_counts", + shape_by = "individual" + ) + + ggtitle("lm - regress out batch within individuals separately") +``` + +What do you think of the results of this approach? 
+ +#### Negative binomial generalized linear models + + +### sctransform + +The `sctransform` approach to using Pearson residuals from an regularized +negative binomial generalized linear model was introduced above. Here we +demonstrate how to apply this method. + +Note that (due to what looks like a bug in this version of `sctransform`) we +need to convert the UMI count matrix to a sparse format to apply sctransform. + +These `sctransform` results will face the problem mentioned above of batch being +nested within individual, which means that we cannot directly remove batch +effects without removing differences between individuals. However, here we will +demonstrate how you *would* try to remove batch effects with `sctransform` for a +kinder experimental design. + + +```r +umi_sparse <- as(counts(umi.qc), "dgCMatrix") +### Genes expressed in at least 5 cells will be kept +sctnorm_data <- sctransform::vst(umi = umi_sparse, min_cells = 1, + cell_attr = as.data.frame(colData(umi.qc)), + latent_var = c("log10_total_counts_endogenous", "batch")) +``` + +``` +## Calculating cell attributes for input UMI matrix +``` + +``` +## Variance stabilizing transformation of count matrix of size 14066 by 657 +``` + +``` +## Model formula is y ~ log10_total_counts_endogenous + batch +``` + +``` +## Get Negative Binomial regression parameters per gene +``` + +``` +## Using 2000 genes, 657 cells +``` + +``` +## | | | 0% +``` + +``` +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +``` + +``` +## Warning in sqrt(1/i): NaNs produced +``` + +``` +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +``` + +``` +## Warning in sqrt(1/i): NaNs produced +``` + +``` +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +``` + +``` +## | |======== | 12% +``` + +``` +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +``` + +``` +## | |================ | 25% +``` + +``` +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration 
limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu 
= fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached +``` + +``` +## | |======================== | 38% +``` + +``` +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +``` + +``` +## | |================================ | 50% +``` + +``` +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +``` + +``` +## | |========================================= | 62% +``` + +``` +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration 
limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu 
= fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +``` + +``` +## | |================================================= | 75% +``` + +``` +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning 
in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration 
limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu 
= fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +``` + +``` +## | |========================================================= | 88% +``` + +``` +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -What do you think of the results of this approach? 
+## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -#### Negative binomial generalized linear models +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached -### sctransform +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration 
limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu 
= fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached + +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached +``` + +``` +## | |=================================================================| 100% +``` + +``` +## Found 123 outliers - those will be ignored in fitting/regularization 
step +``` + +``` +## Second step: Get residuals using fitted parameters for 14066 genes +``` + +``` +## | | | 0% | |= | 2% | |== | 4% | |==== | 5% | |===== | 7% | |====== | 9% | |======= | 11% | |======== | 13% | |========= | 15% | |=========== | 16% | |============ | 18% | |============= | 20% | |============== | 22% | |=============== | 24% | |================= | 25% | |================== | 27% | |=================== | 29% | |==================== | 31% | |===================== | 33% | |====================== | 35% | |======================== | 36% | |========================= | 38% | |========================== | 40% | |=========================== | 42% | |============================ | 44% | |============================== | 45% | |=============================== | 47% | |================================ | 49% | |================================= | 51% | |================================== | 53% | |=================================== | 55% | |===================================== | 56% | |====================================== | 58% | |======================================= | 60% | |======================================== | 62% | |========================================= | 64% | |=========================================== | 65% | |============================================ | 67% | |============================================= | 69% | |============================================== | 71% | |=============================================== | 73% | |================================================ | 75% | |================================================== | 76% | |=================================================== | 78% | |==================================================== | 80% | |===================================================== | 82% | |====================================================== | 84% | |======================================================== | 85% | |========================================================= | 87% | 
|========================================================== | 89% | |=========================================================== | 91% | |============================================================ | 93% | |============================================================= | 95% | |=============================================================== | 96% | |================================================================ | 98% | |=================================================================| 100% +``` + +``` +## Calculating gene attributes +``` + +``` +## Wall clock passed: Time difference of 28.12818 secs +``` + +```r +## Pearson residuals, or deviance residuals +sctnorm_data$model_str +``` + +``` +## [1] "y ~ log10_total_counts_endogenous + batch" +``` + +```r +assay(umi.qc, "sctrans_norm") <- sctnorm_data$y +``` + +Let us look at the NB GLM model parameters estimated by sctransform. + + +```r +#sce$log10_total_counts +## Matrix of estimated model parameters per gene (theta and regression coefficients) +sctransform::plot_model_pars(sctnorm_data) +``` + +<img src="remove-conf_files/figure-html/sctransform-params-plot-1.png" width="90%" style="display: block; margin: auto;" /> + +Do these parameters and the regularization look sensible to you? Any concerns? + + +```r +reducedDim(umi.qc, "PCA_sctrans_norm") <- reducedDim( + runPCA(umi.qc[endog_genes, ], exprs_values = "sctrans_norm") +) +plotReducedDim( + umi.qc, + use_dimred = "PCA_sctrans_norm", + colour_by = "batch", + size_by = "total_features_by_counts", + shape_by = "individual" +) + ggtitle("PCA plot: sctransform normalization") +``` + +<div class="figure" style="text-align: center"> +<img src="remove-conf_files/figure-html/norm-pca-sctransform-1.png" alt="PCA plot of the tung data after sctransform normalisation (Pearson residuals)." 
width="90%" /> +<p class="caption">(\#fig:norm-pca-sctransform)PCA plot of the tung data after sctransform normalisation (Pearson residuals).</p> +</div> + +**Q:** What's happened here? Was that expected? Any other comments? ### Remove Unwanted Variation @@ -190,10 +3995,12 @@ $W$, $\alpha$, $\beta$, and $k$ is infeasible. For a given $k$, instead the following three approaches to estimate the factors of unwanted variation $W$ are used: -* _RUVg_ uses negative control genes (e.g. ERCCs), assumed to have constant expression across samples; -* _RUVs_ uses centered (technical) replicate/negative control samples for which the covariates of interest are -constant; -* _RUVr_ uses residuals, e.g., from a first-pass GLM regression of the counts on the covariates of interest. +* _RUVg_ uses negative control genes (e.g. ERCCs), assumed to have constant + expression across samples; +* _RUVs_ uses centered (technical) replicate/negative control samples for which +the covariates of interest are constant; +* _RUVr_ uses residuals, e.g., from a first-pass GLM regression of the counts on + the covariates of interest. We will concentrate on the first two approaches. @@ -255,7 +4062,12 @@ assay(umi.qc, "ruvs10") <- log2( ### Combat -If you have an experiment with a balanced design, `Combat` can be used to eliminate batch effects while preserving biological effects by specifying the biological effects using the `mod` parameter. However the `Tung` data contains multiple experimental replicates rather than a balanced design so using `mod1` to preserve biological variability will result in an error. +If you have an experiment with a balanced design, `Combat` can be used to +eliminate batch effects while preserving biological effects by specifying the +biological effects using the `mod` parameter. However the `Tung` data contains +multiple experimental replicates rather than a balanced design so using `mod1` +to preserve biological variability will result in an error. 
+ ```r combat_data <- logcounts(umi.qc) @@ -275,6 +4087,10 @@ assay(umi.qc, "combat") <- ComBat( ) ``` +``` +## Standardizing Data across genes +``` + __Exercise 1__ Perform `ComBat` correction accounting for total features as a co-variate. Store the corrected matrix in the `combat_tf` slot. @@ -326,6 +4142,13 @@ indi2 <- do_mnn(umi.qc[, umi.qc$individual == "NA19101"]) indi3 <- do_mnn(umi.qc[, umi.qc$individual == "NA19239"]) identical(colnames(umi.qc), colnames(cbind(indi1, indi2, indi3))) +``` + +``` +## [1] TRUE +``` + +```r assay(umi.qc, "mnn") <- assay(cbind(indi1, indi2, indi3), "corrected") # For a balanced design: @@ -353,25 +4176,62 @@ actually be slower on small datasets like that considered here.] indi1 <- batchelor::fastMNN( umi.qc[, umi.qc$individual == "NA19098"], batch = umi.qc[, umi.qc$individual == "NA19098"]$replicate) +``` + +``` +## Warning in sweep(centered, 2, w, "/", check.margin = FALSE): 'check.margin' is ignored when 'x' is a DelayedArray object or +## derivative +``` + +```r indi2 <- batchelor::fastMNN( umi.qc[, umi.qc$individual == "NA19101"], batch = umi.qc[, umi.qc$individual == "NA19101"]$replicate) +``` + +``` +## Warning in sweep(centered, 2, w, "/", check.margin = FALSE): 'check.margin' is ignored when 'x' is a DelayedArray object or +## derivative +``` + +```r indi3 <- batchelor::fastMNN( umi.qc[, umi.qc$individual == "NA19239"], batch = umi.qc[, umi.qc$individual == "NA19239"]$replicate) +``` + +``` +## Warning in sweep(centered, 2, w, "/", check.margin = FALSE): 'check.margin' is ignored when 'x' is a DelayedArray object or +## derivative +``` +```r identical(colnames(umi.qc), colnames(cbind(assay(indi1, "reconstructed"), assay(indi2, "reconstructed"), assay(indi3, "reconstructed")))) +``` + +``` +## [1] TRUE +``` + +```r fastmnn <- cbind(assay(indi1, "reconstructed"), assay(indi2, "reconstructed"), assay(indi3, "reconstructed")) identical(rownames(umi.qc), rownames(fastmnn)) +``` + +``` +## [1] FALSE +``` + +```r ## fastMNN() 
drops 66 genes, so we cannot immediately add the reconstructed expression matrix to assays() in umi.qc ## But we can run PCA on the reconstructed data from fastMNN() and add that to the reducedDim slot of our SCE object -fastmnn_pca <- runPCA(fastmnn) -reducedDim(umi.qc, "fastmnn") <- fastmnn_pca +fastmnn_pca <- runPCA(fastmnn, rank=2) +reducedDim(umi.qc, "fastmnn") <- fastmnn_pca$rotation ``` For further details, please consult the `batchelor` package documentation and @@ -380,9 +4240,19 @@ For further details, please consult the `batchelor` package documentation and ### Harmony -Harmony [Korsunsky2018fast] is a newer batch correction method, which is designed to operate on PC space. The algorithm proceeds to iteratively cluster the cells, with the objective function formulated to promote cells from multiple datasets within each cluster. Once a clustering is obtained, the positions of the centroids of each dataset are obtained on a per-cluster basis and the coordinates are corrected. This procedure is iterated until convergence. Harmony comes with a `theta` parameter that controls the degree of batch correction (higher values lead to more dataset integration), and can account for multiple experimental and biological factors on input. +Harmony [Korsunsky2018fast] is a newer batch correction method, which is +designed to operate on PC space. The algorithm proceeds to iteratively cluster +the cells, with the objective function formulated to promote cells from multiple +datasets within each cluster. Once a clustering is obtained, the positions of +the centroids of each dataset are obtained on a per-cluster basis and the +coordinates are corrected. This procedure is iterated until convergence. Harmony +comes with a `theta` parameter that controls the degree of batch correction +(higher values lead to more dataset integration), and can account for multiple +experimental and biological factors on input. 
-Seeing how the end result of Harmony is an altered dimensional reduction space created on the basis of PCA, we plot the obtained manifold here and exclude it from the rest of the follow-ups in the section. +Seeing how the end result of Harmony is an altered dimensional reduction space +created on the basis of PCA, we plot the obtained manifold here and exclude it +from the rest of the follow-ups in the section. ```r @@ -390,6 +4260,49 @@ umi.qc.endog <- umi.qc[endog_genes,] umi.qc.endog <- runPCA(umi.qc.endog, exprs_values = 'logcounts', ncomponents = 20) pca <- as.matrix(reducedDim(umi.qc.endog, "PCA")) harmony_emb <- HarmonyMatrix(pca, umi.qc.endog$batch, theta=2, do_pca=FALSE) +``` + +``` +## Harmony 1/10 +``` + +``` +## Harmony 2/10 +``` + +``` +## Harmony 3/10 +``` + +``` +## Harmony 4/10 +``` + +``` +## Harmony 5/10 +``` + +``` +## Harmony 6/10 +``` + +``` +## Harmony 7/10 +``` + +``` +## Harmony 8/10 +``` + +``` +## Harmony 9/10 +``` + +``` +## Harmony 10/10 +``` + +```r reducedDim(umi.qc.endog, "harmony") <- harmony_emb plotReducedDim( @@ -399,10 +4312,15 @@ plotReducedDim( size_by = "total_features_by_counts", shape_by = "individual" ) +``` + +<img src="remove-conf_files/figure-html/harmony-1.png" width="90%" style="display: block; margin: auto;" /> + +```r reducedDim(umi.qc, "harmony") <- reducedDim(umi.qc.endog, "harmony") ``` -### How to evaluate and compare confounder removal strategies +### How to evaluate and compare batch correction A key question when considering the different methods for removing confounders is how to quantitatively determine which one is the most effective. 
The main @@ -432,8 +4350,22 @@ for (nm in assayNames(umi.qc)) { ) reducedDim(umi.qc, paste0("PCA_", nm)) <- reducedDim(tmp, "PCA") } +``` +``` +## counts +## logcounts_raw +## logcounts +## sctrans_norm +## ruvg1 +## ruvg10 +## ruvs1 +## ruvs10 +## combat +## mnn +``` +```r for (nm in reducedDimNames(umi.qc)) { print( plotReducedDim( @@ -448,9 +4380,11 @@ for (nm in reducedDimNames(umi.qc)) { } ``` +<img src="remove-conf_files/figure-html/effectiveness1-1.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-2.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-3.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-4.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-5.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-6.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-7.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-8.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-9.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-10.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-11.png" width="90%" style="display: block; margin: auto;" /><img src="remove-conf_files/figure-html/effectiveness1-12.png" width="90%" style="display: block; margin: auto;" /> + __Exercise 3__ -Consider different `ks` for RUV normalizations. Which gives the best results? +Consider different `k`'s for RUV normalizations. Which gives the best results? 
#### Effectiveness 2 @@ -470,6 +4404,8 @@ par(mar=c(6,4,1,1)) boxplot(res, las=2) ``` +<img src="remove-conf_files/figure-html/effectiveness2-1.png" width="90%" style="display: block; margin: auto;" /> + #### Effectiveness 3 Another method to check the efficacy of batch-effect correction is to consider @@ -546,15 +4482,115 @@ ggplot(dod, aes(Normalisation, Individual, fill=kBET)) + ggtitle("Effect of batch regression methods per individual") ``` +<img src="remove-conf_files/figure-html/kbet-1.png" width="90%" style="display: block; margin: auto;" /> + __Exercise 4__ Why do the raw counts appear to have little batch effects? ### Big Exercise -Perform the same analysis with read counts of the `tung` data. Use `tung/reads.rds` file to load the reads `SCE` object. Once you have finished please compare your results to ours (next chapter). Additionally, experiment with other combinations of normalizations and compare the results. +Perform the same analysis with read counts of the `tung` data. Use +`tung/reads.rds` file to load the reads `SCE` object. Once you have finished +please compare your results to ours (next chapter). Additionally, experiment +with other combinations of normalizations and compare the results. 
### sessionInfo() +``` +## R version 3.6.0 (2019-04-26) +## Platform: x86_64-pc-linux-gnu (64-bit) +## Running under: Ubuntu 18.04.3 LTS +## +## Matrix products: default +## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 +## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 +## +## locale: +## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C +## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 +## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 +## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C +## [9] LC_ADDRESS=C LC_TELEPHONE=C +## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C +## +## attached base packages: +## [1] stats4 parallel stats graphics grDevices utils datasets +## [8] methods base +## +## other attached packages: +## [1] RColorBrewer_1.1-2 reshape2_1.4.3 +## [3] harmony_1.0 Rcpp_1.0.2 +## [5] sva_3.32.1 genefilter_1.66.0 +## [7] mgcv_1.8-28 nlme_3.1-139 +## [9] kBET_0.99.6 scran_1.12.1 +## [11] scater_1.12.2 ggplot2_3.2.1 +## [13] SingleCellExperiment_1.6.0 RUVSeq_1.18.0 +## [15] edgeR_3.26.8 limma_3.40.6 +## [17] EDASeq_2.18.0 ShortRead_1.42.0 +## [19] GenomicAlignments_1.20.1 SummarizedExperiment_1.14.1 +## [21] DelayedArray_0.10.0 matrixStats_0.55.0 +## [23] Rsamtools_2.0.1 GenomicRanges_1.36.1 +## [25] GenomeInfoDb_1.20.0 Biostrings_2.52.0 +## [27] XVector_0.24.0 IRanges_2.18.3 +## [29] S4Vectors_0.22.1 BiocParallel_1.18.1 +## [31] Biobase_2.44.0 BiocGenerics_0.30.0 +## [33] scRNA.seq.funcs_0.1.0 +## +## loaded via a namespace (and not attached): +## [1] backports_1.1.4 aroma.light_3.14.0 +## [3] plyr_1.8.4 igraph_1.2.4.1 +## [5] lazyeval_0.2.2 splines_3.6.0 +## [7] listenv_0.7.0 elliptic_1.4-0 +## [9] digest_0.6.21 htmltools_0.3.6 +## [11] viridis_0.5.1 magrittr_1.5 +## [13] memoise_1.1.0 contfrac_1.1-12 +## [15] cluster_2.1.0 globals_0.12.4 +## [17] annotate_1.62.0 R.utils_2.9.0 +## [19] prettyunits_1.0.2 colorspace_1.4-1 +## [21] blob_1.2.0 xfun_0.9 +## [23] dplyr_0.8.3 crayon_1.3.4 +## [25] RCurl_1.95-4.12 zeallot_0.1.0 +## [27] 
survival_2.43-3 glue_1.3.1 +## [29] gtable_0.3.0 zlibbioc_1.30.0 +## [31] BiocSingular_1.0.0 future.apply_1.3.0 +## [33] scales_1.0.0 DESeq_1.36.0 +## [35] DBI_1.0.0 viridisLite_0.3.0 +## [37] xtable_1.8-4 progress_1.2.2 +## [39] dqrng_0.2.1 bit_1.1-14 +## [41] rsvd_1.0.2 deSolve_1.24 +## [43] httr_1.4.1 FNN_1.1.3 +## [45] pkgconfig_2.0.3 XML_3.98-1.20 +## [47] R.methodsS3_1.7.1 locfit_1.5-9.1 +## [49] dynamicTreeCut_1.63-1 tidyselect_0.2.5 +## [51] labeling_0.3 rlang_0.4.0 +## [53] AnnotationDbi_1.46.1 munsell_0.5.0 +## [55] tools_3.6.0 moments_0.14 +## [57] RSQLite_2.1.2 batchelor_1.0.1 +## [59] evaluate_0.14 stringr_1.4.0 +## [61] yaml_2.2.0 knitr_1.25 +## [63] bit64_0.9-7 hypergeo_1.2-13 +## [65] purrr_0.3.2 future_1.14.0 +## [67] R.oo_1.22.0 biomaRt_2.40.4 +## [69] compiler_3.6.0 beeswarm_0.2.3 +## [71] tibble_2.1.3 statmod_1.4.32 +## [73] geneplotter_1.62.0 stringi_1.4.3 +## [75] highr_0.8 GenomicFeatures_1.36.4 +## [77] lattice_0.20-38 Matrix_1.2-17 +## [79] vctrs_0.2.0 lifecycle_0.1.0 +## [81] pillar_1.4.2 BiocNeighbors_1.2.0 +## [83] cowplot_1.0.0 bitops_1.0-6 +## [85] orthopolynom_1.0-5 irlba_2.3.3 +## [87] rtracklayer_1.44.4 R6_2.4.0 +## [89] latticeExtra_0.6-28 hwriter_1.3.2 +## [91] bookdown_0.13 gridExtra_2.3 +## [93] vipor_0.4.5 codetools_0.2-16 +## [95] MASS_7.3-51.1 assertthat_0.2.1 +## [97] withr_2.1.2 sctransform_0.2.0 +## [99] GenomeInfoDbData_1.2.1 hms_0.5.1 +## [101] grid_3.6.0 tidyr_1.0.0 +## [103] rmarkdown_1.15 DelayedMatrixStats_1.6.1 +## [105] Rtsne_0.15 ggbeeswarm_0.6.0 +``` diff --git a/public/remove-conf_files/figure-html/effectiveness1-1.png b/public/remove-conf_files/figure-html/effectiveness1-1.png index c5e039b166293314e3807e6ebee8df11d2f69ed5..e70598e05e8bca57cf22c6d5866c6b9ffa5967b1 100644 Binary files a/public/remove-conf_files/figure-html/effectiveness1-1.png and b/public/remove-conf_files/figure-html/effectiveness1-1.png differ diff --git a/public/remove-conf_files/figure-html/effectiveness1-10.png 
b/public/remove-conf_files/figure-html/effectiveness1-10.png index a05b69fb90123e0f70f9906e8c7fcedc684d0c3d..1275fa0894f1109efbf5f7e6cac5d623592bd475 100644 Binary files a/public/remove-conf_files/figure-html/effectiveness1-10.png and b/public/remove-conf_files/figure-html/effectiveness1-10.png differ diff --git a/public/remove-conf_files/figure-html/effectiveness1-11.png b/public/remove-conf_files/figure-html/effectiveness1-11.png index ea9aadf9b72ed9cf62420abcc0e5a1cac9373db3..0fcb438ced3a77d05c9a61f8d360906e2d3036f4 100644 Binary files a/public/remove-conf_files/figure-html/effectiveness1-11.png and b/public/remove-conf_files/figure-html/effectiveness1-11.png differ diff --git a/public/remove-conf_files/figure-html/effectiveness1-12.png b/public/remove-conf_files/figure-html/effectiveness1-12.png index dfa339a8211cd2857b402d20ed82fae884302148..ff1a3cf4b3dc80b88a7d0dd8e492aed9708c71b3 100644 Binary files a/public/remove-conf_files/figure-html/effectiveness1-12.png and b/public/remove-conf_files/figure-html/effectiveness1-12.png differ diff --git a/public/remove-conf_files/figure-html/effectiveness1-2.png b/public/remove-conf_files/figure-html/effectiveness1-2.png index 0abf122277ee582c26435cf05032afb0ebde17f2..9377e11d07597e3da0426152f22210e19dec973a 100644 Binary files a/public/remove-conf_files/figure-html/effectiveness1-2.png and b/public/remove-conf_files/figure-html/effectiveness1-2.png differ diff --git a/public/remove-conf_files/figure-html/effectiveness1-3.png b/public/remove-conf_files/figure-html/effectiveness1-3.png index 2fcb774bbd34841a4cd6e2e20bbba4ef9923c261..f2a4a304a48d5ea14c5b0d189fc7a0df10cbca38 100644 Binary files a/public/remove-conf_files/figure-html/effectiveness1-3.png and b/public/remove-conf_files/figure-html/effectiveness1-3.png differ diff --git a/public/remove-conf_files/figure-html/effectiveness1-4.png b/public/remove-conf_files/figure-html/effectiveness1-4.png index 
9c206446992730d18c112ff36785b688ff6d53f3..8ae541953b32387ba30c9fd4e774988a319534f4 100644 Binary files a/public/remove-conf_files/figure-html/effectiveness1-4.png and b/public/remove-conf_files/figure-html/effectiveness1-4.png differ diff --git a/public/remove-conf_files/figure-html/effectiveness1-5.png b/public/remove-conf_files/figure-html/effectiveness1-5.png index 6fe6bdd961d3493bab160eae8e35c8748e250c87..2f06c8ddc9ab14f89d6b6a6c81ac872e990a50e5 100644 Binary files a/public/remove-conf_files/figure-html/effectiveness1-5.png and b/public/remove-conf_files/figure-html/effectiveness1-5.png differ diff --git a/public/remove-conf_files/figure-html/effectiveness1-6.png b/public/remove-conf_files/figure-html/effectiveness1-6.png index eae937b4342fa587a8f225d5df1048738aadc358..cd889afd560311d1da2f1bf53701b26f94a34a0c 100644 Binary files a/public/remove-conf_files/figure-html/effectiveness1-6.png and b/public/remove-conf_files/figure-html/effectiveness1-6.png differ diff --git a/public/remove-conf_files/figure-html/effectiveness1-7.png b/public/remove-conf_files/figure-html/effectiveness1-7.png index 50eb4666234959c3587ab8eb3ef36e3d63e98c66..a1b4f725b26aff072973ed1582010b3bc3e3f22c 100644 Binary files a/public/remove-conf_files/figure-html/effectiveness1-7.png and b/public/remove-conf_files/figure-html/effectiveness1-7.png differ diff --git a/public/remove-conf_files/figure-html/effectiveness1-8.png b/public/remove-conf_files/figure-html/effectiveness1-8.png index 02938c5eea67a06d3eb677e2ec866d3afb0cd32b..471e580223d769c5023ff11dd36262fbe0df8378 100644 Binary files a/public/remove-conf_files/figure-html/effectiveness1-8.png and b/public/remove-conf_files/figure-html/effectiveness1-8.png differ diff --git a/public/remove-conf_files/figure-html/effectiveness1-9.png b/public/remove-conf_files/figure-html/effectiveness1-9.png index 27d166ccabdfb259e2c4ff6d280e2eece0049983..4fd8fdd236db7e573cdd96698397b9245b5de285 100644 Binary files 
a/public/remove-conf_files/figure-html/effectiveness1-9.png and b/public/remove-conf_files/figure-html/effectiveness1-9.png differ diff --git a/public/remove-conf_files/figure-html/effectiveness2-1.png b/public/remove-conf_files/figure-html/effectiveness2-1.png index 8a55fd4b01fb1055f26545438a43a52cdd002b9a..ad84ca1dfa10c00eda17bd9ee3c25265f0221fe9 100644 Binary files a/public/remove-conf_files/figure-html/effectiveness2-1.png and b/public/remove-conf_files/figure-html/effectiveness2-1.png differ diff --git a/public/remove-conf_files/figure-html/harmony-1.png b/public/remove-conf_files/figure-html/harmony-1.png index 5988c3c0be0d7b5e9cd6001413e96a8c7aa624e3..4865356d78a802ad1285a7c05ce0208e872dbd38 100644 Binary files a/public/remove-conf_files/figure-html/harmony-1.png and b/public/remove-conf_files/figure-html/harmony-1.png differ diff --git a/public/remove-conf_files/figure-html/kbet-1.png b/public/remove-conf_files/figure-html/kbet-1.png index 3d36ddc08e42b2908fcf757843c1192fffdd36fc..650e37ab133c25052ef7d75984e09d0a450d4b0f 100644 Binary files a/public/remove-conf_files/figure-html/kbet-1.png and b/public/remove-conf_files/figure-html/kbet-1.png differ diff --git a/public/remove-conf_files/figure-html/norm-pca-sctransform-1.png b/public/remove-conf_files/figure-html/norm-pca-sctransform-1.png new file mode 100644 index 0000000000000000000000000000000000000000..c51b1e7b1e36f16d389b4f6a226eba2d28e167df Binary files /dev/null and b/public/remove-conf_files/figure-html/norm-pca-sctransform-1.png differ diff --git a/public/remove-conf_files/figure-html/sctransform-params-plot-1.png b/public/remove-conf_files/figure-html/sctransform-params-plot-1.png new file mode 100644 index 0000000000000000000000000000000000000000..ee691de32d5d40b1f1accffe01ecc5f17d3a8cd0 Binary files /dev/null and b/public/remove-conf_files/figure-html/sctransform-params-plot-1.png differ diff --git a/public/resources.html b/public/resources.html index 
2706b2e76ef96d7042ef52b17f2e0fd65eb7b823..be208207513321e7f6a2410864c943783a77c88e 100644 --- a/public/resources.html +++ b/public/resources.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> 
sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: 
#60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" 
data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li 
class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> diff --git a/public/search_index.json b/public/search_index.json index f8b0f164217c659a793136db39e30600b6dd2475..933afc84fd202dcec4cae16339988fafe7d04720 100644 --- a/public/search_index.json +++ b/public/search_index.json @@ -1,16 +1,16 @@ [ -["index.html", "Analysis of single cell RNA-seq data 1 About the course 1.1 Web page 1.2 GitLab 1.3 Video 1.4 Docker image 1.5 Manual installation 1.6 Citation 1.7 License 1.8 Prerequisites 1.9 Contact", " Analysis of single cell RNA-seq data Davis McCarthy (davisjmcc), Ruqian Lyu, PuXue Qiao, Vladimir Kiselev (wikiselev), Tallulah Andrews (talandrews), Jennifer Westoby (Jenni_Westoby), Maren Büttner (marenbuettner), Jimmy 
Lee (THJimmyLee), Krzysztof Polanski, Sebastian Y. Müller, Elo Madissoon, Stephane Ballereau, Maria Do Nascimento Lopes Primo, Rocio Martinez Nunez and Martin Hemberg (m_hemberg) 2019-10-01 1 About the course Today it is possible to obtain genome-wide transcriptome data from single cells using high-throughput sequencing (scRNA-seq). The cellular resolution and the genome-wide scope of scRNA-seq makes it possible to address issues that are intractable using other methods like bulk RNA-seq or single-cell RT-qPCR. However, scRNA-seq data poses many challenges due to the scale and complexity of scRNA-seq datasets, with novel methods often required to account for the particular characteristics of the data. In this course we will discuss some of the questions that can be addressed using scRNA-seq as well as the available computational and statistical methods. We will cover key features of the technology platforms and fundamental principles of scRNA-seq data analysis that are transferable across technologies and analysis workflows. The number of computational tools is already vast and increasing rapidly, so we provide hands-on workflows using some of our favourite tools on carefully selected, biologically-relevant example datasets. Across two days, attendees can expect to gain an understanding of approaches to and practical analysis experience on: quality control, data normalisation, visualisation, clustering, trajectory (pseudotime) inference, differential expression, batch correction and data integration. 
Course outline: Day 1: Morning session 1: Workshop overview; introduction to scRNA-seq; pre-processing scRNA-seq data Morning session 2: Quality control, visualisation and exploratory data analysis Afternoon session 1: Normalisation, confounders and batch correction Afternoon session 2: Latent spaces, clustering and cell annotation Day 2: Morning session 1: Trajectory inference Morning session 2: Differential expression; data imputation Afternoon session 1: Combining datasets and data integration Afternoon session 2: Case studies This course has been adapted from a course taught through the University of Cambridge Bioinformatics training unit, but the material is meant for anyone interested in learning about computational analysis of scRNA-seq data and is updated roughly twice per year. The number of computational tools is increasing rapidly and we are doing our best to keep up to date with what is available. One of the main constraints for this course is that we would like to use tools that are implemented in R and that run reasonably fast. Moreover, we will also confess to being somewhat biased towards methods that have been developed either by us or by our friends and colleagues. 1.1 Web page The html version of the workshop material is available at the following link: https://biocellgen-public.svi.edu.au/mig_2019_scrnaseq-workshop/public/index.html 1.2 GitLab The source code and materials for the course are available at the SVI Bioinformatics and Cellular Genomics Lab’s GitLab: https://gitlab.svi.edu.au/biocellgen-public/mig_2019_scrnaseq-workshop 1.3 Video This video was recorded during the course (2 days) in May 2019 in Cambridge, UK. This recorded version of the course differs slightly from the version in this document. 1.3.1 Day 1 1.3.2 Day 2 1.4 Docker image The course can be reproduced without any package installation by running the course docker image which contains all the required packages. 
Workshop Docker Repository on DockerHub 1.4.1 Run the image Make sure Docker is installed on your system. If not, please follow these instructions. To run the course docker image (use the latest version): docker run -p 8888:8888 -e PASSWORD="jupyter" svibiocellgen/mig_2019_scrnaseq-workshop:latest Then follow the instructions provided, e.g.: To access the notebook, open this file in a browser: file:///home/jovyan/.local/share/jupyter/runtime/nbserver-6-open.html Or copy and paste one of these URLs: http://(a9ee1aad5398 or 127.0.0.1):8888/?token=22debff49d9aae3c50e6b0b5241b47eefb1b8f883fcb7e6d A Jupyter session will be open in a web browser (we recommend Chrome). 1.4.1.1 Windows users On Windows operating system the IP address of the container can be different from 127.0.0.1 (localhost). To find the IP address please run: docker-machine ip default 1.4.2 Download data/other files 1.4.2.1 Download from AWS (within Docker) Recommended if you are using Docker In the Jupyter session, please click on New -> Terminal. In the new terminal window please run: ./poststart.sh 1.4.2.2 Manual download from AWS If you want to download data files from AWS outside of Docker image you can still use the same poststart.sh script but you will need to install AWS CLI on your computer. Alternatively, you can browse and download the files in you web-browser by visiting this link. NB: Only the core datasets (i.e. not Tabula Muris) are available from AWS storage. 1.4.2.3 Manual download from SVI Recommended if you are using your own computer For simplicity, we have also hosted the core datasets used in the course and a subset of the Tabula Muris data on SVI websites. There are two files to download, both “tarballsâ€, i.e. compressed archives of multiple folders and files. 1.4.2.3.1 Core datasets To download the core datasets, click this link (195Mb). It is most convenient to download the tarball to the head directory for the course. 
We then want to unpack the tarball and move it to a directory called data in the head directory of the repository. To do this at the command line: wget https://www.svi.edu.au/MIG_2019_scRNAseq-workshop/mig-sc-workshop-2019-data.tar.gz mkdir workshop-data tar -xvf mig-sc-workshop-2019-data.tar.gz --directory workshop-data mv workshop-data/mnt/mcfiles/Datasets/MIG_2019_scRNAseq-workshop/data ./ rm -r workshop-data [This requires a little bit of faff to get all of the directory paths correct and then tidy updated.] Alternatively, if you are working on your laptop, unpack the tarball using the default method on your system (usually a double click on the *.tar.gz file will do the trick) and drag and drop the data folder to the workshop directory. 1.4.2.3.2 Tabula Muris To download the Tabula Muris data, clink this link (655Mb). We then go through a similar process as described above to unpack the tarball. wget https://www.svi.edu.au/MIG_2019_scRNAseq-workshop/Tabula_Muris.tar.gz tar -xvf Tabula_Muris.tar.gz mv mnt/mcfiles/Datasets/Tabula_Muris data rm -r mnt 1.4.2.3.3 Desired results The data folder then should contain both the core datasets and the Tabula Muris data, and have the following structure: data ├── 10cells_barcodes.txt ├── 2000_reference.transcripts.fa ├── deng │  └── deng-reads.rds ├── droplet_id_example_per_barcode.txt.gz ├── droplet_id_example_truth.gz ├── EXAMPLE.cram ├── pancreas │  ├── muraro.rds │  └── segerstolpe.rds ├── pbmc3k_filtered_gene_bc_matrices │  └── hg19 │  ├── barcodes.tsv │  ├── genes.tsv │  └── matrix.mtx ├── sce │  ├── Heart_10X.rds │  └── Thymus_10X.rds ├── Tabula_Muris │  ├── droplet │  │  ├── droplet │  │  ├── droplet_annotation.csv │  │  └── droplet_metadata.csv │  └── FACS_smartseq2 │  ├── FACS │  ├── FACS_annotations.csv │  └── FACS_metadata.csv └── tung ├── annotation.txt ├── molecules.txt ├── reads.txt ├── TNs.txt └── TPs.txt 11 directories, 22 files With the files in these locations, everything is set up to run the code as 
presented in the RMarkdown files in the workshop. 1.4.3 RStudio Now go back to Jupyter browser tab and change word tree in the url to rstudio. RStudio server will open with all of the course files, software and the data folder available. 1.5 Manual installation If you are not using a docker image of the course, then to be able to run all code chunks of the course you need to clone or download the course GitHub repository and start an R session in the course_files folder. You will also need to install all required packages manually. We are using Bioconductor version 3.9 packages in this version of the course. The install.R file in the workshop repository provides the necessary commands for installing all of the required packages. You can run this script from the command line with Rscript install.R or copy-and-paste the commands into an R session and run them interactively. Alternatively, you can just install packages listed in a chapter of interest. 1.6 Citation This version of the workshop has been updated by Davis J. McCarthy, Ruqian Lyu and PuXue Qiao, based on the 2019-07-01 version of the course: Ruqian Lyu, PuXue Qiao, Vladimir Kiselev, Tallulah Andrews, Jennifer Westoby, Maren Büttner, Jimmy Lee, Krzysztof Polanski, Sebastian Y. Müller, Elo Madissoon, Stephane Ballereau, Maria Do Nascimento Lopes Primo, Rocio Martinez Nunez, Martin Hemberg and Davis J. McCarthy, (2019), “Analysis of single cell RNA-seq dataâ€, https://scrnaseq-course.cog.sanger.ac.uk/website/index.html 1.7 License All of the course material is licensed under GPL-3. Anyone is welcome to go through the material in order to learn about analysis of scRNA-seq data. If you plan to use the material for your own teaching, we would appreciate if you tell us about it in addition to providing a suitable citation. 1.8 Prerequisites The course is intended for those who have basic familiarity with Unix and the R statistical language. 
We will also assume that you are familiar with mapping and analysing bulk RNA-seq data as well as with the commonly available computational tools. We recommend attending the Introduction to RNA-seq and ChIP-seq data analysis or the Analysis of high-throughput sequencing data with Bioconductor before attending this course. 1.9 Contact If you have any comments, questions or suggestions about the material, please contact Davis McCarthy. "], +["index.html", "Analysis of single cell RNA-seq data 1 About the course 1.1 Web page 1.2 GitLab 1.3 Video 1.4 Docker image 1.5 Manual installation 1.6 Citation 1.7 License 1.8 Prerequisites 1.9 Contact", " Analysis of single cell RNA-seq data Davis McCarthy (davisjmcc), Ruqian Lyu, PuXue Qiao, Vladimir Kiselev (wikiselev), Tallulah Andrews (talandrews), Jennifer Westoby (Jenni_Westoby), Maren Büttner (marenbuettner), Jimmy Lee (THJimmyLee), Krzysztof Polanski, Sebastian Y. Müller, Elo Madissoon, Stephane Ballereau, Maria Do Nascimento Lopes Primo, Rocio Martinez Nunez and Martin Hemberg (m_hemberg) 2019-10-01 1 About the course Today it is possible to obtain genome-wide transcriptome data from single cells using high-throughput sequencing (scRNA-seq). The cellular resolution and the genome-wide scope of scRNA-seq makes it possible to address issues that are intractable using other methods like bulk RNA-seq or single-cell RT-qPCR. However, scRNA-seq data poses many challenges due to the scale and complexity of scRNA-seq datasets, with novel methods often required to account for the particular characteristics of the data. In this course we will discuss some of the questions that can be addressed using scRNA-seq as well as the available computational and statistical methods. We will cover key features of the technology platforms and fundamental principles of scRNA-seq data analysis that are transferable across technologies and analysis workflows. 
The number of computational tools is already vast and increasing rapidly, so we provide hands-on workflows using some of our favourite tools on carefully selected, biologically-relevant example datasets. Across two days, attendees can expect to gain an understanding of approaches to and practical analysis experience on: quality control, data normalisation, visualisation, clustering, trajectory (pseudotime) inference, differential expression, batch correction and data integration. Course outline: Day 1: Morning session 1: Workshop overview; introduction to scRNA-seq; pre-processing scRNA-seq data Morning session 2: Quality control, visualisation and exploratory data analysis Afternoon session 1: Normalisation, confounders and batch correction Afternoon session 2: Latent spaces, clustering and cell annotation Day 2: Morning session 1: Trajectory inference Morning session 2: Differential expression; data imputation Afternoon session 1: Combining datasets and data integration Afternoon session 2: Case studies This course has been adapted from a course taught through the University of Cambridge Bioinformatics training unit, but the material is meant for anyone interested in learning about computational analysis of scRNA-seq data and is updated roughly twice per year. The number of computational tools is increasing rapidly and we are doing our best to keep up to date with what is available. One of the main constraints for this course is that we would like to use tools that are implemented in R and that run reasonably fast. Moreover, we will also confess to being somewhat biased towards methods that have been developed either by us or by our friends and colleagues. 
1.1 Web page The html version of the workshop material is available at the following link: https://biocellgen-public.svi.edu.au/mig_2019_scrnaseq-workshop/public/index.html 1.2 GitLab The source code and materials for the course are available at the SVI Bioinformatics and Cellular Genomics Lab’s GitLab: https://gitlab.svi.edu.au/biocellgen-public/mig_2019_scrnaseq-workshop 1.3 Video This video was recorded during the course (2 days) in May 2019 in Cambridge, UK. This recorded version of the course differs slightly from the version in this document. 1.3.1 Day 1 1.3.2 Day 2 1.4 Docker image The course can be reproduced without any package installation by running the course docker image which contains all the required packages. Workshop Docker Repository on DockerHub 1.4.1 Run the image Make sure Docker is installed on your system. If not, please follow these instructions. To run the course docker image (use the latest version): docker run -p 8888:8888 -e PASSWORD="jupyter" svibiocellgen/mig_2019_scrnaseq-workshop:v1.01 Then follow the instructions provided, e.g.: To access the notebook, open this file in a browser: file:///home/jovyan/.local/share/jupyter/runtime/nbserver-6-open.html Or copy and paste one of these URLs: http://(a9ee1aad5398 or 127.0.0.1):8888/?token=22debff49d9aae3c50e6b0b5241b47eefb1b8f883fcb7e6d A Jupyter session will be open in a web browser (we recommend Chrome). 1.4.1.1 Windows users On Windows operating system the IP address of the container can be different from 127.0.0.1 (localhost). To find the IP address please run: docker-machine ip default 1.4.2 Download data/other files 1.4.2.1 Download from AWS (within Docker) Recommended if you are using Docker In the Jupyter session, please click on New -> Terminal. 
In the new terminal window please run: ./poststart.sh 1.4.2.2 Manual download from AWS If you want to download data files from AWS outside of Docker image you can still use the same poststart.sh script but you will need to install AWS CLI on your computer. Alternatively, you can browse and download the files in you web-browser by visiting this link. NB: Only the core datasets (i.e. not Tabula Muris) are available from AWS storage. 1.4.2.3 Manual download from SVI Recommended if you are using your own computer For simplicity, we have also hosted the core datasets used in the course and a subset of the Tabula Muris data on SVI websites. There are two files to download, both “tarballsâ€, i.e. compressed archives of multiple folders and files. 1.4.2.3.1 Core datasets To download the core datasets, click this link (195Mb). It is most convenient to download the tarball to the head directory for the course. We then want to unpack the tarball and move it to a directory called data in the head directory of the repository. To do this at the command line: wget https://www.svi.edu.au/MIG_2019_scRNAseq-workshop/mig-sc-workshop-2019-data.tar.gz mkdir workshop-data tar -xvf mig-sc-workshop-2019-data.tar.gz --directory workshop-data mv workshop-data/mnt/mcfiles/Datasets/MIG_2019_scRNAseq-workshop/data ./ rm -r workshop-data [This requires a little bit of faff to get all of the directory paths correct and then tidy updated.] Alternatively, if you are working on your laptop, unpack the tarball using the default method on your system (usually a double click on the *.tar.gz file will do the trick) and drag and drop the data folder to the workshop directory. 1.4.2.3.2 Tabula Muris To download the Tabula Muris data, clink this link (655Mb). We then go through a similar process as described above to unpack the tarball. 
wget https://www.svi.edu.au/MIG_2019_scRNAseq-workshop/Tabula_Muris.tar.gz tar -xvf Tabula_Muris.tar.gz mv mnt/mcfiles/Datasets/Tabula_Muris data rm -r mnt 1.4.2.3.3 Desired results The data folder then should contain both the core datasets and the Tabula Muris data, and have the following structure: data ├── 10cells_barcodes.txt ├── 2000_reference.transcripts.fa ├── deng │  └── deng-reads.rds ├── droplet_id_example_per_barcode.txt.gz ├── droplet_id_example_truth.gz ├── EXAMPLE.cram ├── pancreas │  ├── muraro.rds │  └── segerstolpe.rds ├── pbmc3k_filtered_gene_bc_matrices │  └── hg19 │  ├── barcodes.tsv │  ├── genes.tsv │  └── matrix.mtx ├── sce │  ├── Heart_10X.rds │  └── Thymus_10X.rds ├── Tabula_Muris │  ├── droplet │  │  ├── droplet │  │  ├── droplet_annotation.csv │  │  └── droplet_metadata.csv │  └── FACS_smartseq2 │  ├── FACS │  ├── FACS_annotations.csv │  └── FACS_metadata.csv └── tung ├── annotation.txt ├── molecules.txt ├── reads.txt ├── TNs.txt └── TPs.txt 11 directories, 22 files With the files in these locations, everything is set up to run the code as presented in the RMarkdown files in the workshop. 1.4.3 RStudio Now go back to Jupyter browser tab and change word tree in the url to rstudio. RStudio server will open with all of the course files, software and the data folder available. 1.5 Manual installation If you are not using a docker image of the course, then to be able to run all code chunks of the course you need to clone or download the course GitHub repository and start an R session in the course_files folder. You will also need to install all required packages manually. We are using Bioconductor version 3.9 packages in this version of the course. The install.R file in the workshop repository provides the necessary commands for installing all of the required packages. You can run this script from the command line with Rscript install.R or copy-and-paste the commands into an R session and run them interactively. 
Alternatively, you can just install packages listed in a chapter of interest. 1.6 Citation This version of the workshop has been updated by Davis J. McCarthy, Ruqian Lyu and PuXue Qiao, based on the 2019-07-01 version of the course: Ruqian Lyu, PuXue Qiao, Vladimir Kiselev, Tallulah Andrews, Jennifer Westoby, Maren Büttner, Jimmy Lee, Krzysztof Polanski, Sebastian Y. Müller, Elo Madissoon, Stephane Ballereau, Maria Do Nascimento Lopes Primo, Rocio Martinez Nunez, Martin Hemberg and Davis J. McCarthy, (2019), “Analysis of single cell RNA-seq dataâ€, https://scrnaseq-course.cog.sanger.ac.uk/website/index.html 1.7 License All of the course material is licensed under GPL-3. Anyone is welcome to go through the material in order to learn about analysis of scRNA-seq data. If you plan to use the material for your own teaching, we would appreciate if you tell us about it in addition to providing a suitable citation. 1.8 Prerequisites The course is intended for those who have basic familiarity with Unix and the R statistical language. We will also assume that you are familiar with mapping and analysing bulk RNA-seq data as well as with the commonly available computational tools. We recommend attending the Introduction to RNA-seq and ChIP-seq data analysis or the Analysis of high-throughput sequencing data with Bioconductor before attending this course. 1.9 Contact If you have any comments, questions or suggestions about the material, please contact Davis McCarthy. "], ["introduction-to-single-cell-rna-seq.html", "2 Introduction to single-cell RNA-seq 2.1 Bulk RNA-seq 2.2 scRNA-seq 2.3 Workflow 2.4 Computational Analysis 2.5 Challenges 2.6 Experimental methods 2.7 What platform to use for my experiment? 
2.8 Unique Molecular Identifiers (UMIs)", " 2 Introduction to single-cell RNA-seq 2.1 Bulk RNA-seq A major breakthrough (replaced microarrays) in the late 00’s and has been widely used since Measures the average expression level for each gene across a large population of input cells Useful for comparative transcriptomics, e.g. samples of the same tissue from different species Useful for quantifying expression signatures from ensembles, e.g. in disease studies Insufficient for studying heterogeneous systems, e.g. early development studies, complex tissues (brain) Does not provide insights into the stochastic nature of gene expression 2.2 scRNA-seq A new technology, first publication by (Tang et al. 2009) Did not gain widespread popularity until ~2014 when new protocols and lower sequencing costs made it more accessible Measures the distribution of expression levels for each gene across a population of cells Allows to study new biological questions in which cell-specific changes in transcriptome are important, e.g. cell type identification, heterogeneity of cell responses, stochasticity of gene expression, inference of gene regulatory networks across the cells. Datasets range from \\(10^2\\) to \\(10^6\\) cells and increase in size every year Currently there are several different protocols in use, e.g. SMART-seq2 (Picelli et al. 2013), CELL-seq (Hashimshony et al. 2012) and Drop-seq (Macosko et al. 2015) There are also commercial platforms available, including the Fluidigm C1, Wafergen ICELL8 and the 10X Genomics Chromium Several computational analysis methods from bulk RNA-seq can be used In most cases computational analysis requires adaptation of the existing methods or development of new ones 2.3 Workflow Figure 2.1: Single cell sequencing (taken from Wikipedia) Overall, experimental scRNA-seq protocols are similar to the methods used for bulk RNA-seq. We will be discussing some of the most common approaches in the next chapter. 
2.4 Computational Analysis This course is concerned with the computational analysis of the data obtained from scRNA-seq experiments. The first steps (yellow) are general for any highthroughput sequencing data. Later steps (orange) require a mix of existing RNASeq analysis methods and novel methods to address the technical difference of scRNASeq. Finally the biological interpretation (blue) should be analyzed with methods specifically developed for scRNASeq. Figure 2.2: Flowchart of the scRNA-seq analysis There are several reviews of the scRNA-seq analysis available including (Stegle, Teichmann, and Marioni 2015). Today, there are also several different platforms available for carrying out one or more steps in the flowchart above. These include: Falco a single-cell RNA-seq processing framework on the cloud. SCONE (Single-Cell Overview of Normalized Expression), a package for single-cell RNA-seq data quality control and normalization. Seurat is an R package designed for QC, analysis, and exploration of single cell RNA-seq data. ASAP (Automated Single-cell Analysis Pipeline) is an interactive web-based platform for single-cell analysis. Bioconductor is a open-source, open-development software project for the analysis of high-throughput genomics data, including packages for the analysis of single-cell data. 2.5 Challenges The main difference between bulk and single cell RNA-seq is that each sequencing library represents a single cell, instead of a population of cells. Therefore, significant attention has to be paid to comparison of the results from different cells (sequencing libraries). The main sources of discrepancy between the libraries are: Amplification (up to 1 million fold) Gene ‘dropouts’ in which a gene is observed at a moderate expression level in one cell but is not detected in another cell (Kharchenko, Silberstein, and Scadden 2014). 
In both cases the discrepancies are introduced due to low starting amounts of transcripts since the RNA comes from one cell only. Improving the transcript capture efficiency and reducing the amplification bias are currently active areas of research. However, as we shall see in this course, it is possible to alleviate some of these issues through proper normalization and corrections. 2.6 Experimental methods Figure 2.3: Moore’s law in single cell transcriptomics (image taken from Svensson et al) Development of new methods and protocols for scRNA-seq is currently a very active area of research, and several protocols have been published over the last few years. A non-comprehensive list includes: CEL-seq (Hashimshony et al. 2012) CEL-seq2 (Hashimshony et al. 2016) Drop-seq (Macosko et al. 2015) InDrop-seq (Klein et al. 2015) MARS-seq (Jaitin et al. 2014) SCRB-seq (Soumillon et al. 2014) Seq-well (Gierahn et al. 2017) Smart-seq (Picelli et al. 2014) Smart-seq2 (Picelli et al. 2014) SMARTer STRT-seq (Islam et al. 2013) The methods can be categorized in different ways, but the two most important aspects are quantification and capture. For quantification, there are two types, full-length and tag-based. The former tries to achieve a uniform read coverage of each transcript. By contrast, tag-based protocols only capture either the 5’- or 3’-end of each RNA. The choice of quantification method has important implications for what types of analyses the data can be used for. In theory, full-length protocols should provide an even coverage of transcripts, but as we shall see, there are often biases in the coverage. The main advantage of tag-based protocol is that they can be combined with unique molecular identifiers (UMIs) which can help improve the quantification (see chapter 2.8). On the other hand, being restricted to one end of the transcript may reduce the mappability and it also makes it harder to distinguish different isoforms (Archer et al. 2016). 
The strategy used for capture determines throughput, how the cells can be selected as well as what kind of additional information besides the sequencing that can be obtained. The three most widely used options are microwell-, microfluidic- and droplet-based. Figure 2.4: Image of microwell plates (image taken from Wikipedia) For well-based platforms, cells are isolated using for example pipette or laser capture and placed in microfluidic wells. One advantage of well-based methods is that they can be combined with fluorescent activated cell sorting (FACS), making it possible to select cells based on surface markers. This strategy is thus very useful for situations when one wants to isolate a specific subset of cells for sequencing. Another advantage is that one can take pictures of the cells. The image provides an additional modality and a particularly useful application is to identify wells containing damaged cells or doublets. The main drawback of these methods is that they are often low-throughput and the amount of work required per cell may be considerable. Figure 2.5: Image of a 96-well Fluidigm C1 chip (image taken from Fluidigm) Microfluidic platforms, such as Fluidigm’s C1, provide a more integrated system for capturing cells and for carrying out the reactions necessary for the library preparations. Thus, they provide a higher throughput than microwell based platforms. Typically, only around 10% of cells are captured in a microfluidic platform and thus they are not appropriate if one is dealing with rare cell-types or very small amounts of input. Moreover, the chip is relatively expensive, but since reactions can be carried out in a smaller volume money can be saved on reagents. Figure 2.6: Schematic overview of the drop-seq method (Image taken from Macosko et al) The idea behind droplet based methods is to encapsulate each individual cell inside a nanoliter droplet together with a bead. The bead is loaded with the enzymes required to construct the library. 
In particular, each bead contains a unique barcode which is attached to all of the reads originating from that cell. Thus, all of the droplets can be pooled, sequenced together and the reads can subsequently be assigned to the cell of origin based on the barcodes. Droplet platforms typically have the highest throughput since the library preparation costs are on the order of \\(.05\\) USD/cell. Instead, sequencing costs often become the limiting factor and in a typical experiment the coverage is low with only a few thousand different transcripts detected (Ziegenhain et al. 2017). 2.7 What platform to use for my experiment? The most suitable platform depends on the biological question at hand. For example, if one is interested in characterizing the composition of a tissue, then a droplet-based method which will allow a very large number of cells to be captured is likely to be the most appropriate. On the other hand, if one is interested in characterizing a rare cell-population for which there is a known surface marker, then it is probably best to enrich using FACS and then sequence a smaller number of cells. Clearly, full-length transcript quantification will be more appropriate if one is interested in studying different isoforms since tagged protocols are much more limited. By contrast, UMIs can only be used with tagged protocols and they can facilitate gene-level quantification. Two recent studies from the Enard group (Ziegenhain et al. 2017) and the Teichmann group (Svensson et al. 2017) have compared several different protocols. In their study, Ziegenhain et al compared five different protocols on the same sample of mouse embryonic stem cells (mESCs). By controlling for the number of cells as well as the sequencing depth, the authors were able to directly compare the sensitivity, noise-levels and costs of the different protocols. 
One example of their conclusions is illustrated in the figure below which shows the number of genes detected (for a given detection threshold) for the different methods. As you can see, there is almost a two-fold difference between drop-seq and Smart-seq2, suggesting that the choice of protocol can have a major impact on the study. Figure 2.7: Enard group study Svensson et al take a different approach by using synthetic transcripts (spike-ins, more about these later) with known concentrations to measure the accuracy and sensitivity of different protocols. Comparing a wide range of studies, they also reported substantial differences between the protocols. Figure 2.8: Teichmann group study As protocols are developed and computational methods for quantifying the technical noise are improved, it is likely that future studies will help us gain further insights regarding the strengths of the different methods. These comparative studies are helpful not only for helping researchers decide which protocol to use, but also for developing new methods as the benchmarking makes it possible to determine what strategies are the most useful ones. 2.8 Unique Molecular Identifiers (UMIs) Thanks to Andreas Buness from EMBL Monterotondo for collaboration on this section. 2.8.1 Introduction Unique Molecular Identifiers are short (4-10bp) random barcodes added to transcripts during reverse-transcription. They enable sequencing reads to be assigned to individual transcript molecules and thus the removal of amplification noise and biases from scRNASeq data. Figure 2.9: UMI sequencing protocol When sequencing UMI containing data, techniques are used to specifically sequence only the end of the transcript containing the UMI (usually the 3’ end). 2.8.2 Mapping Barcodes Since the number of unique barcodes (\\(4^N\\), where \\(N\\) is the length of UMI) is much smaller than the total number of molecules per cell (~\\(10^6\\)), each barcode will typically be assigned to multiple transcripts. 
Hence, to identify unique molecules both barcode and mapping location (transcript) must be used. The first step is to map UMI reads, for which we recommend using STAR since it is fast and outputs good quality BAM-alignments. Moreover, mapping locations can be useful for e.g. identifying poorly-annotated 3’ UTRs of transcripts. UMI-sequencing typically consists of paired-end reads where one read from each pair captures the cell and UMI barcodes while the other read consists of exonic sequence from the transcript (Figure 2.10). Note that trimming and/or filtering to remove reads containing poly-A sequence is recommended to avoid errors due to these reads mapping to genes/transcripts with internal poly-A/poly-T sequences. After processing the reads from a UMI experiment, the following conventions are often used: The UMI is added to the read name of the other paired read. Reads are sorted into separate files by cell barcode. For extremely large, shallow datasets, the cell barcode may be added to the read name as well to reduce the number of files. Figure 2.10: UMI sequencing reads, red lightning bolts represent different fragmentation locations 2.8.3 Counting Barcodes In theory, every unique UMI-transcript pair should represent all reads originating from a single RNA molecule. However, in practice this is frequently not the case and the most common reasons are: Different UMI does not necessarily mean different molecule Due to PCR or sequencing errors, base-pair substitution events can result in new UMI sequences. Longer UMIs give more opportunity for errors to arise and based on estimates from cell barcodes we expect 7-10% of 10bp UMIs to contain at least one error. If not corrected for, this type of error will result in an overestimate of the number of transcripts. Different transcript does not necessarily mean different molecule Mapping errors and/or multimapping reads may result in some UMIs being assigned to the wrong gene/transcript. 
This type of error will also result in an overestimate of the number of transcripts. Same UMI does not necessarily mean same molecule Biases in UMI frequency and short UMIs can result in the same UMI being attached to different mRNA molecules from the same gene. Thus, the number of transcripts may be underestimated. Figure 2.11: Potential Errors in UMIs 2.8.4 Correcting for Errors How to best account for errors in UMIs remains an active area of research. The best approaches that we are aware of for resolving the issues mentioned above are: UMI-tools’ directional-adjacency method implements a procedure which considers both the number of mismatches and the relative frequency of similar UMIs to identify likely PCR/sequencing errors. Currently an open question. The problem may be mitigated by removing UMIs with few reads to support their association with a particular transcript, or by removing all multi-mapping reads. Simple saturation (aka “collision probability”) correction proposed by Grun, Kester and van Oudenaarden (2014) to estimate the true number of molecules \\(M\\): \\[M \\approx -N*log(1 - \\frac{n}{N})\\] where N = total number of unique UMI barcodes and n = number of observed barcodes. An important caveat of this method is that it assumes that all UMIs are equally frequent. In most cases this is incorrect, since there is often a bias related to the GC content. Figure 2.12: Per gene amplification rate Determining how to best process and use UMIs is currently an active area of research in the bioinformatics community. We are aware of several methods that have recently been developed, including: UMI-tools PoissonUMIs zUMIs dropEst 2.8.5 Downstream Analysis Current UMI platforms (DropSeq, InDrop, ICell8) exhibit low and highly variable capture efficiency as shown in the figure below. Figure 2.13: Variability in Capture Efficiency This variability can introduce strong biases and it needs to be considered in downstream analysis. 
Recent analyses often pool cells/genes together based on cell-type or biological pathway to increase the power. Robust statistical analyses of this data is still an open research question and it remains to be determined how to best adjust for biases. Exercise 1 We have provided you with UMI counts and read counts from induced pluripotent stem cells generated from three different individuals (Tung et al. 2017) (see: Chapter 6.1 for details of this dataset). umi_counts <- read.table("data/tung/molecules.txt", sep = "\\t") read_counts <- read.table("data/tung/reads.txt", sep = "\\t") Using this data: Plot the variability in capture efficiency Determine the amplification rate: average number of reads per UMI. References "], -["introduction-to-rbioconductor.html", "3 Introduction to R/Bioconductor 3.1 Installing packages 3.2 Installation instructions: 3.3 Data-types/classes 3.4 Basic data structures 3.5 Accessing documentation and help files 3.6 Data Types 3.7 What is Bioconductor? 3.8 SingleCellExperiment class 3.9 scater package 3.10 Introduction to ggplot2", " 3 Introduction to R/Bioconductor 3.1 Installing packages 3.1.1 CRAN The Comprehensive R Archive Network CRAN is the biggest archive of R packages. There are few requirements for uploading packages besides building and installing succesfully, hence documentation and support is often minimal and figuring how to use these packages can be a challenge it itself. CRAN is the default repository R will search to find packages to install: install.packages("devtools") require("devtools") 3.1.2 Github Github isn’t specific to R, any code of any type in any state can be uploaded. There is no guarantee a package uploaded to github will even install, nevermind do what it claims to do. R packages can be downloaded and installed directly from github using the “devtools†package installed above. devtools::install_github("tallulandrews/M3Drop") Github is also a version control system which stores multiple versions of any package. 
By default the most recent “master†version of the package is installed. If you want an older version or the development branch this can be specified using the “ref†parameter: # different branch devtools::install_github("tallulandrews/M3D", ref="nbumi") # previous commit devtools::install_github("tallulandrews/M3Drop", ref="434d2da28254acc8de4940c1dc3907ac72973135") Note: make sure you re-install the M3Drop master branch for later in the course. 3.1.3 Bioconductor Bioconductor is a repository of R-packages specifically for biological analyses. It has the strictest requirements for submission, including installation on every platform and full documentation with a tutorial (called a vignette) explaining how the package should be used. Bioconductor also encourages utilization of standard data structures/classes and coding style/naming conventions, so that, in theory, packages and analyses can be combined into large pipelines or workflows. source("https://bioconductor.org/biocLite.R") biocLite("edgeR") Note: in some situations it is necessary to substitute “http://†for “https://†in the above depending on the security features of your internet connection/network. Bioconductor also requires creators to support their packages and has a regular 6-month release schedule. Make sure you are using the most recent release of bioconductor before trying to install packages for the course. source("https://bioconductor.org/biocLite.R") biocLite("BiocUpgrade") 3.1.4 Source The final way to install packages is directly from source. In this case you have to download a fully built source code file, usually packagename.tar.gz, or clone the github repository and rebuild the package yourself. Generally this will only be done if you want to edit a package yourself, or if for some reason the former methods have failed. install.packages("M3Drop_3.05.00.tar.gz", type="source") 3.2 Installation instructions: All the packages necessary for this course are available here. 
Starting from “RUN Rscript -e "install.packages(‘devtools’)"”, run each of the commands (minus “RUN”) on the command line or start an R session and run each of the commands within the quotation marks. Note the ordering of the installation is important in some cases, so make sure you run them in order from top to bottom. 3.3 Data-types/classes R is a high level language so the underlying data-type is generally not important. The exception is if you are accessing R data directly using another language such as C, but that is beyond the scope of this course. Instead we will consider the basic data classes: numeric, integer, logical, and character, and the higher level data class called “factor”. You can check what class your data is using the “class()” function. Aside: R can also store data as “complex” for complex numbers but generally this isn’t relevant for biological analyses. 3.3.1 Numeric The “numeric” class is the default class for storing any numeric data - integers, decimal numbers, numbers in scientific notation, etc… x = 1.141 class(x) ## [1] "numeric" y = 42 class(y) ## [1] "numeric" z = 6.02e23 class(z) ## [1] "numeric" Here we see that even though R has an “integer” class and 42 could be stored more efficiently as an integer the default is to store it as “numeric”. If we want 42 to be stored as an integer we must “coerce” it to that class: y = as.integer(42) class(y) ## [1] "integer" Coercion will force R to store data as a particular class, if our data is incompatible with that class it will still do it but the data will be converted to NAs: as.numeric("H") ## Warning: NAs introduced by coercion ## [1] NA Above we tried to coerce “character” data, identified by the double quotation marks, into numeric data which doesn’t make sense, so we triggered (“threw”) a warning message. Since this is only a warning R would continue with any subsequent commands in a script/function, whereas an “error” would cause R to halt. 
3.3.2 Character/String The “character” class stores all kinds of text data. Programming convention calls data containing multiple letters a “string”, thus most R functions which act on character data will refer to the data as “strings” and will often have “str” or “string” in its name. Strings are identified by being flanked by double quotation marks, whereas variable/function names are not: x = 5 a = "x" # character "x" a ## [1] "x" b = x # variable x b ## [1] 5 In addition to standard alphanumeric characters, strings can also store various special characters. Special characters are identified using a backslash followed by a single character, the most relevant are the special characters for tab : \\t and new line : \\n. To demonstrate these special characters let's concatenate (cat) together two strings with these characters separating (sep) them: cat("Hello", "World", sep= " ") ## Hello World cat("Hello", "World", sep= "\\t") ## Hello World cat("Hello", "World", sep= "\\n") ## Hello ## World Note that special characters work differently in different functions. For instance the paste function does the same thing as cat but does not recognize special characters. paste("Hello", "World", sep= " ") ## [1] "Hello World" paste("Hello", "World", sep= "\\t") ## [1] "Hello\\tWorld" paste("Hello", "World", sep= "\\n") ## [1] "Hello\\nWorld" Single or double backslash is also used as an escape character to turn off special characters or allow quotation marks to be included in strings: cat("This \\"string\\" contains quotation marks.") ## This "string" contains quotation marks. Special characters are generally only used in pattern matching, and reading/writing data to files. For instance this is how you would read a tab-separated file into R. dat = read.delim("file.tsv", sep="\\t") Another special type of character data are colours. 
Colours can be specified in three main ways: by name from those available, by red, green, blue values using the rgb function, and by hue (colour), saturation (colour vs white) and value (colour/white vs black) using the hsv function. By default rgb and hsv expect three values in 0-1 with an optional fourth value for transparency. Alternatively, sets of predetermined colours with useful properties can be loaded from many different packages with RColorBrewer being one of the most popular. reds = c("red", rgb(1,0,0), hsv(0, 1, 1)) reds ## [1] "red" "#FF0000" "#FF0000" barplot(c(1,1,1), col=reds, names=c("by_name", "by_rgb", "by_hsv")) 3.3.3 Logical The logical class stores boolean truth values, i.e. TRUE and FALSE. It is used for storing the results of logical operations and conditional statements will be coerced to this class. Most other data-types can be coerced to boolean without triggering (or “throwingâ€) error messages, which may cause unexpected behaviour. x = TRUE class(x) ## [1] "logical" y = "T" as.logical(y) ## [1] TRUE z = 5 as.logical(z) ## [1] TRUE x = FALSE class(x) ## [1] "logical" y = "F" as.logical(y) ## [1] FALSE z = 0 as.logical(z) ## [1] FALSE Exercise 1 Experiment with other character and numeric values, which are coerced to TRUE or FALSE? which are coerced to neither? Do you ever throw a warning/error message? 3.3.4 Factors String/Character data is very memory inefficient to store, each letter generally requires the same amount of memory as any integer. Thus when storing a vector of strings with repeated elements it is more efficient assign each element to an integer and store the vector as integers and an additional string-to-integer association table. Thus, by default R will read in text columns of a data table as factors. 
str_vector = c("Apple", "Apple", "Banana", "Banana", "Banana", "Carrot", "Carrot", "Apple", "Banana") factored_vector = factor(str_vector) factored_vector ## [1] Apple Apple Banana Banana Banana Carrot Carrot Apple Banana ## Levels: Apple Banana Carrot as.numeric(factored_vector) ## [1] 1 1 2 2 2 3 3 1 2 The double nature of factors can cause some unintuitive behaviour. E.g. joining two factors together will convert them to the numeric form and the original strings will be lost. c(factored_vector, factored_vector) ## [1] 1 1 2 2 2 3 3 1 2 1 1 2 2 2 3 3 1 2 Likewise if due to formatting issues numeric data is mistakenly interpreted as strings, then you must convert the factor back to strings before coercing to numeric values: x = c("20", "25", "23", "38", "20", "40", "25", "30") x = factor(x) as.numeric(x) ## [1] 1 3 2 5 1 6 3 4 as.numeric(as.character(x)) ## [1] 20 25 23 38 20 40 25 30 To make R read text as character data instead of factors set the environment option stringsAsFactors=FALSE. This must be done at the start of each R session. options(stringsAsFactors=FALSE) Exercise How would you use factors to create a vector of colours for an arbitrarily long vector of fruits like str_vector above? Answer 3.3.5 Checking class/type We recommend checking your data is of the correct class after reading from files: x = 1.4 is.numeric(x) ## [1] TRUE is.character(x) ## [1] FALSE is.logical(x) ## [1] FALSE is.factor(x) ## [1] FALSE 3.4 Basic data structures So far we have only looked at single values and vectors. Vectors are the simplest data structure in R. They are a 1-dimensional array of data all of the same type. If the input when creating a vector is of different types it will be coerced to the data-type that is most consistent with the data. x = c("Hello", 5, TRUE) x ## [1] "Hello" "5" "TRUE" class(x) ## [1] "character" Here we tried to put character, numeric and logical data into a single vector so all the values were coerced to character data. 
A matrix is the two dimensional version of a vector, it also requires all data to be of the same type. If we combine a character vector and a numeric vector into a matrix, all the data will be coerced to characters: x = c("A", "B", "C") y = c(1, 2, 3) class(x) ## [1] "character" class(y) ## [1] "numeric" m = cbind(x, y) m ## x y ## [1,] "A" "1" ## [2,] "B" "2" ## [3,] "C" "3" The quotation marks indicate that the numeric vector has been coerced to characters. Alternatively, to store data with columns of different data-types we can use a dataframe. z = data.frame(x, y) z ## x y ## 1 A 1 ## 2 B 2 ## 3 C 3 class(z[,1]) ## [1] "character" class(z[,2]) ## [1] "numeric" If you have set stringsAsFactors=FALSE as above you will find the first column remains characters, otherwise it will be automatically converted to a factor. options(stringsAsFactors=TRUE) z = data.frame(x, y) class(z[,1]) ## [1] "factor" Another difference between matrices and dataframes is the ability to select columns using the $ operator: m$x # throws an error z$x # ok The final basic data structure is the list. Lists allow data of different types and different lengths to be stored in a single object. Each element of a list can be any other R object : data of any type, any data structure, even other lists or functions. 
l = list(m, z) ll = list(sublist=l, a_matrix=m, numeric_value=42, this_string="Hello World", even_a_function=cbind) ll ## $sublist ## $sublist[[1]] ## x y ## [1,] "A" "1" ## [2,] "B" "2" ## [3,] "C" "3" ## ## $sublist[[2]] ## x y ## 1 A 1 ## 2 B 2 ## 3 C 3 ## ## ## $a_matrix ## x y ## [1,] "A" "1" ## [2,] "B" "2" ## [3,] "C" "3" ## ## $numeric_value ## [1] 42 ## ## $this_string ## [1] "Hello World" ## ## $even_a_function ## function (..., deparse.level = 1) ## .Internal(cbind(deparse.level, ...)) ## <bytecode: 0x55bacbd9d118> ## <environment: namespace:base> Lists are most commonly used when returning a large number of results from a function that do not fit into any of the previous data structures. 3.5 Accessing documentation and help files You can get more information about any R commands relevant to these datatypes using by typing ?function in an interactive session. 3.6 Data Types 3.6.1 What is Tidy Data? Tidy data is a concept largely defined by Hadley Wickham (Wickham 2014). Tidy data has the following three characteristics: Each variable has its own column. Each observation has its own row. Each value has its own cell. Here is an example of some tidy data: ## Students Subject Years Score ## 1 Mark Maths 1 5 ## 2 Jane Biology 2 6 ## 3 Mohammed Physics 3 4 ## 4 Tom Maths 2 7 ## 5 Celia Computing 3 9 Here is an example of some untidy data: ## Students Sport Category Counts ## 1 Matt Tennis Wins 0 ## 2 Matt Tennis Losses 1 ## 3 Ellie Rugby Wins 3 ## 4 Ellie Rugby Losses 2 ## 5 Tim Football Wins 1 ## 6 Tim Football Losses 4 ## 7 Louise Swimming Wins 2 ## 8 Louise Swimming Losses 2 ## 9 Kelly Running Wins 5 ## 10 Kelly Running Losses 1 Task 1: In what ways is the untidy data not tidy? How could we make the untidy data tidy? Tidy data is generally easier to work with than untidy data, especially if you are working with packages such as ggplot. Fortunately, packages are available to make untidy data tidy. 
Today we will explore a few of the functions available in the tidyr package which can be used to make untidy data tidy. If you are interested in finding out more about tidying data, we recommend reading “R for Data Scienceâ€, by Garrett Grolemund and Hadley Wickham. An electronic copy is available here: http://r4ds.had.co.nz/ The untidy data above is untidy because two variables (Wins and Losses) are stored in one column (Category). This is a common way in which data can be untidy. To tidy this data, we need to make Wins and Losses into columns, and store the values in Counts in these columns. Fortunately, there is a function from the tidyverse packages to perform this operation. The function is called spread, and it takes two arguments, key and value. You should pass the name of the column which contains multiple variables to key, and pass the name of the column which contains values from multiple variables to value. For example: library(tidyverse) sports<-data.frame(Students=c("Matt", "Matt", "Ellie", "Ellie", "Tim", "Tim", "Louise", "Louise", "Kelly", "Kelly"), Sport=c("Tennis","Tennis", "Rugby", "Rugby","Football", "Football","Swimming","Swimming", "Running", "Running"), Category=c("Wins", "Losses", "Wins", "Losses", "Wins", "Losses", "Wins", "Losses", "Wins", "Losses"), Counts=c(0,1,3,2,1,4,2,2,5,1)) sports ## Students Sport Category Counts ## 1 Matt Tennis Wins 0 ## 2 Matt Tennis Losses 1 ## 3 Ellie Rugby Wins 3 ## 4 Ellie Rugby Losses 2 ## 5 Tim Football Wins 1 ## 6 Tim Football Losses 4 ## 7 Louise Swimming Wins 2 ## 8 Louise Swimming Losses 2 ## 9 Kelly Running Wins 5 ## 10 Kelly Running Losses 1 spread(sports, key=Category, value=Counts) ## Students Sport Losses Wins ## 1 Ellie Rugby 2 3 ## 2 Kelly Running 1 5 ## 3 Louise Swimming 2 2 ## 4 Matt Tennis 1 0 ## 5 Tim Football 4 1 Task 2: The dataframe foods defined below is untidy. 
Work out why and use spread() to tidy it foods<-data.frame(student=c("Antoinette","Antoinette","Taylor", "Taylor", "Alexa", "Alexa"), Category=c("Dinner", "Dessert", "Dinner", "Dessert", "Dinner","Dessert"), Frequency=c(3,1,4,5,2,1)) The other common way in which data can be untidy is if the columns are values instead of variables. For example, the dataframe below shows the percentages some students got in tests they did in May and June. The data is untidy because the columns May and June are values, not variables. percentages<-data.frame(student=c("Alejandro", "Pietro", "Jane"), "May"=c(90,12,45), "June"=c(80,30,100)) Fortunately, there is a function in the tidyverse packages to deal with this problem too. gather() takes the names of the columns which are values, the key and the value as arguments. This time, the key is the name of the variable with values as column names, and the value is the name of the variable with values spread over multiple columns. Ie: gather(percentages, "May", "June", key="Month", value = "Percentage") ## student Month Percentage ## 1 Alejandro May 90 ## 2 Pietro May 12 ## 3 Jane May 45 ## 4 Alejandro June 80 ## 5 Pietro June 30 ## 6 Jane June 100 These examples don’t have much to do with single-cell RNA-seq analysis, but are designed to help illustrate the features of tidy and untidy data. You will find it much easier to analyse your single-cell RNA-seq data if your data is stored in a tidy format. Fortunately, the data structures we commonly use to facilitate single-cell RNA-seq analysis usually encourage store your data in a tidy manner. 3.6.2 What is Rich Data? If you google ‘rich data’, you will find lots of different definitions for this term. In this course, we will use ‘rich data’ to mean data which is generated by combining information from multiple sources. 
For example, you could make rich data by creating an object in R which contains a matrix of gene expression values across the cells in your single-cell RNA-seq experiment, but also information about how the experiment was performed. Objects of the SingleCellExperiment class, which we will discuss below, are an example of rich data. Typically, Bioconductor packages make use of rich data objects that have many advantages for package developers and users alike. 3.7 What is Bioconductor? From Wikipedia: Bioconductor is a free, open source and open development software project for the analysis and comprehension of genomic data generated by wet lab experiments in molecular biology. Bioconductor is based primarily on the statistical R programming language, but does contain contributions in other programming languages. It has two releases each year that follow the semiannual releases of R. At any one time there is a release version,which corresponds to the released version of R, and a development version, which corresponds to the development version of R. Most users will find the release version appropriate for their needs. We strongly recommend all new comers and even experienced high-throughput data analysts to use well developed and maintained Bioconductor methods and classes. 3.8 SingleCellExperiment class SingleCellExperiment (SCE) is a S4 class for storing data from single-cell experiments. This includes specialized methods to store and retrieve spike-in information, dimensionality reduction coordinates and size factors for each cell, along with the usual metadata for genes and libraries. 
In practice, an object of this class can be created using its constructor: library(SingleCellExperiment) counts <- matrix(rpois(100, lambda = 10), ncol=10, nrow=10) rownames(counts) <- paste("gene", 1:10, sep = "") colnames(counts) <- paste("cell", 1:10, sep = "") sce <- SingleCellExperiment( assays = list(counts = counts), rowData = data.frame(gene_names = paste("gene_name", 1:10, sep = "")), colData = data.frame(cell_names = paste("cell_name", 1:10, sep = "")) ) sce ## class: SingleCellExperiment ## dim: 10 10 ## metadata(0): ## assays(1): counts ## rownames(10): gene1 gene2 ... gene9 gene10 ## rowData names(1): gene_names ## colnames(10): cell1 cell2 ... cell9 cell10 ## colData names(1): cell_names ## reducedDimNames(0): ## spikeNames(0): In the SingleCellExperiment, users can assign arbitrary names to entries of assays. To assist interoperability between packages, some suggestions for what the names should be for particular types of data are provided by the authors: counts: Raw count data, e.g., number of reads or transcripts for a particular gene. normcounts: Normalized values on the same scale as the original counts. For example, counts divided by cell-specific size factors that are centred at unity. logcounts: Log-transformed counts or count-like values. In most cases, this will be defined as log-transformed normcounts, e.g., using log base 2 and a pseudo-count of 1. cpm: Counts-per-million. This is the read count for each gene in each cell, divided by the library size of each cell in millions. tpm: Transcripts-per-million. This is the number of transcripts for each gene in each cell, divided by the total number of transcripts in that cell (in millions). Each of these suggested names has an appropriate getter/setter method for convenient manipulation of the SingleCellExperiment. 
For example, we can take the (very specifically named) counts slot, normalise it and assign it to normcounts instead: normcounts(sce) <- log2(counts(sce) + 1) sce ## class: SingleCellExperiment ## dim: 10 10 ## metadata(0): ## assays(2): counts normcounts ## rownames(10): gene1 gene2 ... gene9 gene10 ## rowData names(1): gene_names ## colnames(10): cell1 cell2 ... cell9 cell10 ## colData names(1): cell_names ## reducedDimNames(0): ## spikeNames(0): dim(normcounts(sce)) ## [1] 10 10 head(normcounts(sce)) ## cell1 cell2 cell3 cell4 cell5 cell6 cell7 ## gene1 3.169925 3.169925 2.000000 2.584963 2.584963 3.321928 3.584963 ## gene2 3.459432 1.584963 3.584963 3.807355 3.700440 3.700440 3.000000 ## gene3 3.000000 3.169925 3.807355 3.169925 3.321928 3.321928 3.321928 ## gene4 3.584963 3.459432 3.000000 3.807355 3.700440 3.700440 3.700440 ## gene5 3.906891 3.000000 3.169925 3.321928 3.584963 3.459432 3.807355 ## gene6 3.700440 3.700440 3.584963 4.000000 3.169925 3.000000 3.459432 ## cell8 cell9 cell10 ## gene1 3.321928 3.807355 2.807355 ## gene2 3.807355 3.700440 4.000000 ## gene3 2.584963 4.000000 3.700440 ## gene4 3.169925 3.584963 3.700440 ## gene5 3.807355 2.584963 3.584963 ## gene6 3.321928 3.459432 4.000000 3.9 scater package scater is a R package for single-cell RNA-seq analysis (McCarthy et al. 2017). The package contains several useful methods for quality control, visualisation and pre-processing of data prior to further downstream analysis. scater features the following functionality: Automated computation of QC metrics Transcript quantification from read data with pseudo-alignment Data format standardisation Rich visualizations for exploratory analysis Seamless integration into the Bioconductor universe Simple normalisation methods We highly recommend to use scater for all single-cell RNA-seq analyses and scater is the basis of the first part of the course. 
As illustrated in the figure below, scater will help you with quality control, filtering and normalization of your expression matrix following mapping and alignment. Keep in mind that this figure represents the original version of scater where an SCESet class was used. In the newest version this figure is still correct, except that SCESet can be substituted with the SingleCellExperiment class. 3.10 Introduction to ggplot2 3.10.1 What is ggplot2? ggplot2 is an R package designed by Hadley Wickham which facilitates data plotting. In this lab, we will touch briefly on some of the features of the package. If you would like to learn more about how to use ggplot2, we would recommend reading “ggplot2: Elegant Graphics for Data Analysis”, by Hadley Wickham. 3.10.2 Principles of ggplot2 Your data must be a dataframe if you want to plot it using ggplot2. Use the aes mapping function to specify how variables in the dataframe map to features on your plot Use geoms to specify how your data should be represented on your graph e.g. as a scatterplot, a barplot, a boxplot etc. 3.10.3 Using the aes mapping function The aes function specifies how variables in your dataframe map to features on your plot. 
To understand how this works, let’s look at an example: library(ggplot2) library(tidyverse) set.seed(1) counts <- as.data.frame(matrix(rpois(100, lambda = 10), ncol=10, nrow=10)) Gene_ids <- paste("gene", 1:10, sep = "") colnames(counts) <- paste("cell", 1:10, sep = "") counts<-data.frame(Gene_ids, counts) counts ## Gene_ids cell1 cell2 cell3 cell4 cell5 cell6 cell7 cell8 cell9 cell10 ## 1 gene1 8 8 3 5 5 9 11 9 13 6 ## 2 gene2 10 2 11 13 12 12 7 13 12 15 ## 3 gene3 7 8 13 8 9 9 9 5 15 12 ## 4 gene4 11 10 7 13 12 12 12 8 11 12 ## 5 gene5 14 7 8 9 11 10 13 13 5 11 ## 6 gene6 12 12 11 15 8 7 10 9 10 15 ## 7 gene7 11 11 14 11 11 5 9 13 13 7 ## 8 gene8 9 12 9 8 6 14 7 12 12 10 ## 9 gene9 14 12 11 7 10 10 8 14 7 10 ## 10 gene10 11 10 9 7 11 16 8 7 7 4 ggplot(data = counts, mapping = aes(x = cell1, y = cell2)) Let’s take a closer look at the final command, ggplot(data = counts, mapping = aes(x = cell1, y = cell2)). ggplot() initialises a ggplot object and takes the arguments data and mapping. We pass our dataframe of counts to data and use the aes() function to specify that we would like to use the variable cell1 as our x variable and the variable cell2 as our y variable. Task 1: Modify the command above to initialise a ggplot object where cell10 is the x variable and cell8 is the y variable. Clearly, the plots we have just created are not very informative because no data is displayed on them. To display data, we will need to use geoms. 3.10.4 Geoms We can use geoms to specify how we would like data to be displayed on our graphs. For example, our choice of geom could specify that we would like our data to be displayed as a scatterplot, a barplot or a boxplot. Let’s see how our graph would look as a scatterplot. ggplot(data = counts, mapping = aes(x = cell1, y = cell2)) + geom_point() Now we can see that there doesn’t seem to be any correlation between gene expression in cell1 and cell2. Given we generated counts randomly, this isn’t too surprising. 
Task 2: Modify the command above to create a line plot. Hint: execute ?ggplot and scroll down the help page. At the bottom is a link to the ggplot package index. Scroll through the index until you find the geom options. 3.10.5 Plotting data from more than 2 cells So far we’ve been considering the gene counts from 2 of the cells in our dataframe. But there are actually 10 cells in our dataframe and it would be nice to compare all of them. What if we wanted to plot data from all 10 cells at the same time? At the moment we can’t do this because we are treating each individual cell as a variable and assigning that variable to either the x or the y axis. We could create a 10 dimensional graph to plot data from all 10 cells on, but this is a) not possible to do with ggplot and b) not very easy to interpret. What we could do instead is to tidy our data so that we had one variable representing cell ID and another variable representing gene counts, and plot those against each other. In code, this would look like: counts<-gather(counts, colnames(counts)[2:11], key = 'Cell_ID', value='Counts') head(counts) ## Gene_ids Cell_ID Counts ## 1 gene1 cell1 8 ## 2 gene2 cell1 10 ## 3 gene3 cell1 7 ## 4 gene4 cell1 11 ## 5 gene5 cell1 14 ## 6 gene6 cell1 12 Essentially, the problem before was that our data was not tidy because one variable (Cell_ID) was spread over multiple columns. Now that we’ve fixed this problem, it is much easier for us to plot data from all 10 cells on one graph. ggplot(counts,aes(x=Cell_ID, y=Counts)) + geom_boxplot() Task 3: Use the updated counts dataframe to plot a barplot with Cell_ID as the x variable and Counts as the y variable. Hint: you may find it helpful to read ?geom_bar. Task 4: Use the updated counts dataframe to plot a scatterplot with Gene_ids as the x variable and Counts as the y variable. 3.10.6 Plotting heatmaps A common method for visualising gene expression data is with a heatmap. 
Here we will use the R package pheatmap to perform this analysis with some gene expression data we will name test. library(pheatmap) set.seed(2) test = matrix(rnorm(200), 20, 10) test[1:10, seq(1, 10, 2)] = test[1:10, seq(1, 10, 2)] + 3 test[11:20, seq(2, 10, 2)] = test[11:20, seq(2, 10, 2)] + 2 test[15:20, seq(2, 10, 2)] = test[15:20, seq(2, 10, 2)] + 4 colnames(test) = paste("Cell", 1:10, sep = "") rownames(test) = paste("Gene", 1:20, sep = "") pheatmap(test) Let’s take a moment to work out what this graphic is showing us. Each row represents a gene and each column represents a cell. How highly expressed each gene is in each cell is represented by the colour of the corresponding box. For example, we can tell from this plot that gene18 is highly expressed in cell10 but lowly expressed in cell1. This plot also gives us information on the results of a clustering algorithm. In general, clustering algorithms aim to split datapoints (eg.cells) into groups whose members are more alike one another than they are alike the rest of the datapoints. The trees drawn on the top and left hand sides of the graph are the results of clustering algorithms and enable us to see, for example, that cells 4,8,2,6 and 10 are more alike one another than they are alike cells 7,3,5,1 and 9. The tree on the left hand side of the graph represents the results of a clustering algorithm applied to the genes in our dataset. If we look closely at the trees, we can see that eventually they have the same number of branches as there are cells and genes. In other words, the total number of cell clusters is the same as the total number of cells, and the total number of gene clusters is the same as the total number of genes. Clearly, this is not very informative, and will become impractical when we are looking at more than 10 cells and 20 genes. Fortunately, we can set the number of clusters we see on the plot. 
Let’s try setting the number of gene clusters to 2: pheatmap(test, kmeans_k = 2) Now we can see that the genes fall into two clusters - a cluster of 8 genes which are upregulated in cells 2, 10, 6, 4 and 8 relative to the other cells and a cluster of 12 genes which are downregulated in cells 2, 10, 6, 4 and 8 relative to the other cells. Task 5: Try setting the number of clusters to 3. Which number of clusters do you think is more informative? 3.10.7 Principal Component Analysis Principal component analysis (PCA) is a statistical procedure that uses a transformation to convert a set of observations into a set of values of linearly uncorrelated variables called principal components. The transformation is carried out so that the first principal component accounts for as much of the variability in the data as possible, and each following principal component accounts for the greatest amount of variance possible under the constraint that it must be orthogonal to the previous components. PCA plots are a good way to get an overview of your data, and can sometimes help identify confounders which explain a high amount of the variability in your data. We will investigate how we can use PCA plots in single-cell RNA-seq analysis in more depth in a future lab, here the aim is to give you an overview of what PCA plots are and how they are generated. Let’s make a PCA plot for our test data. We can use the ggfortify package to let ggplot know how to interpret principal components. library(ggfortify) Principal_Components<-prcomp(test) autoplot(Principal_Components, label=TRUE) Task 6: Compare your clusters to the pheatmap clusters. Are they related? 
(Hint: have a look at the gene tree for the first pheatmap we plotted) Task 7: Produce a heatmap and PCA plot for counts (below): set.seed(1) counts <- as.data.frame(matrix(rpois(100, lambda = 10), ncol=10, nrow=10)) rownames(counts) <- paste("gene", 1:10, sep = "") colnames(counts) <- paste("cell", 1:10, sep = "") References "], -["datasets.html", "4 Datasets 4.1 Deng 4.2 Tung 4.3 Pancreas 4.4 Heart 4.5 Thymus 4.6 Tabula Muris 4.7 Introduction 4.8 Downloading the data 4.9 Reading the data (Smartseq2) 4.10 Building a SingleCellExperiment object 4.11 Reading the data (10X) 4.12 Building a SingleCellExperiment object for the 10X data 4.13 Advanced Exercise", " 4 Datasets Here we provide brief descriptions of the core datasets used in this course and a more detailed description of the Tabula Muris (mouse cell atlas) data, how it can be downloaded and how it can be used. 4.1 Deng A single-cell RNA-seq dataset of 268 individual cells dissociated from in vivo F1 embryos from oocyte to blastocyst stages of mouse preimplantation development. Single-cell transcriptome profiles were generated with Smart-seq or Smart-seq2 from each individual cell with spike-ins (NB: both the Smart-seq and Smart-seq2 protocols were used, for different sets of cells in the dataset). Cells analysed here have been annotated with their developmental stages according to the original publication. Deng, Qiaolin, et al. “Single-cell RNA-seq reveals dynamic, random monoallelic gene expression in mammalian cells.” Science 343.6167 (2014) 193-196. 4.2 Tung A dataset of induced pluripotent stem cells generated from three different individuals with replicates (Tung et al. 2017) in Yoav Gilad’s lab at the University of Chicago. Data generated using Fluidigm C1 platform and to facilitate the quantification both unique molecular identifiers (UMIs) and ERCC spike-ins were used. The data files are located in the tung folder in your working directory. 
These files are the copies of the original files made on the 15/03/16. We will use these copies for reproducibility purposes. Tung, Po-Yuan, et al. “Batch effects and the effective design of single-cell gene expression studies.” Scientific reports 7 (2017): 39921. 4.3 Pancreas We have included two human pancreas datasets: from Muraro et al (2016) and Segerstolpe et al. (2016). Since the pancreas has been widely studied, these datasets are well annotated. 4.3.1 Muraro Single-cell CEL-seq data were generated using a customised automated platform that uses FACS, robotics, and the CEL-Seq2 protocol to obtain the transcriptomes of thousands of single pancreatic cells from four deceased organ donors. Cell surface markers can be used for sorting and enriching certain cell types. Muraro,M.J. et al. (2016) A Single-Cell Transcriptome Atlas of the Human Pancreas. Cell Syst, 3, 385–394.e3. 4.3.2 Segerstolpe Single-cell RNA-seq dataset of human pancreatic cells from patients with type 2 diabetes and healthy controls. Single cells were prepared using Smart-seq2 protocol and sequenced on an Illumina HiSeq 2000. Segerstolpe,Å. et al. (2016) Single-Cell Transcriptome Profiling of Human Pancreatic Islets in Health and Type 2 Diabetes. Cell Metab., 24, 593–607. 4.4 Heart data/sce/Heart_10X.rds is a SCE object containing cells from Heart tissue from the Tabula Muris dataset (details below) using 10X protocol. 4.5 Thymus data/sce/Thymus_10X.rds is a SCE object containing cells from Thymus tissue from the Tabula Muris dataset (details below) using 10X protocol. 4.6 Tabula Muris 4.7 Introduction To give you hands-on experience analyzing from start to finish a single-cell RNASeq dataset we will be using as an example, data from the Tabula Muris initial release. The Tabula Muris is an international collaboration with the aim to profile every cell-type in the mouse using a standardized method. 
They combine high-throughput but low-coverage 10X data with lower throughput but high-coverage FACS-sorted cells + Smartseq2. The initial release of the data (20 Dec 2017), contains almost 100,000 cells across 20 different tissues/organs. You might like to choose a tissue to focus on for a detailed analysis. 4.8 Downloading the data Unlike most single-cell RNA-seq data Tabula Muris has released their data through the figshare platform rather than uploading it to GEO or ArrayExpress. You can find the data by using the doi’s in their paper : 10.6084/m9.figshare.5715040 for FACS/Smartseq2 and 10.6084/m9.figshare.5715025 for 10X data. The data can be downloaded manually by clicking the doi links or by using the command-line commands below: Terminal-based download of FACS data: wget https://ndownloader.figshare.com/files/10038307 unzip 10038307 wget https://ndownloader.figshare.com/files/10038310 mv 10038310 FACS_metadata.csv wget https://ndownloader.figshare.com/files/10039267 mv 10039267 FACS_annotations.csv Terminal-based download of 10X data: wget https://ndownloader.figshare.com/files/10038325 unzip 10038325 wget https://ndownloader.figshare.com/files/10038328 mv 10038328 droplet_metadata.csv wget https://ndownloader.figshare.com/files/10039264 mv 10039264 droplet_annotation.csv Note if you download the data by hand you should unzip & rename the files as above before continuing. You should now have two folders : “FACS” and “droplet” and one annotation and metadata file for each. To inspect these files you can use the head to see the top few lines of the text files (Press “q” to exit): head -n 10 droplet_metadata.csv You can also check the number of rows in each file using: wc -l droplet_annotation.csv Exercise How many cells do we have annotations for from FACS? from 10X? Answer FACS : 54,838 cells Droplet : 42,193 cells 4.9 Reading the data (Smartseq2) We can now read in the relevant count matrix from the comma-separated file. 
Then inspect the resulting dataframe: dat <- read.delim("FACS/Kidney-counts.csv", sep=",", header=TRUE) dat[1:5,1:5] We can see that the first column in the dataframe is the gene names, so first we move these to the rownames so we have a numeric matrix: dim(dat) rownames(dat) <- dat[,1] dat <- dat[,-1] Since this is a Smart-seq2 dataset it may contain spike-ins so lets check: rownames(dat)[grep("^ERCC-", rownames(dat))] Now we can extract much of the metadata for this data from the column names: cellIDs <- colnames(dat) cell_info <- strsplit(cellIDs, "\\\\.") Well <- lapply(cell_info, function(x){x[1]}) Well <- unlist(Well) Plate <- unlist(lapply(cell_info, function(x){x[2]})) Mouse <- unlist(lapply(cell_info, function(x){x[3]})) We can check the distributions of each of these metadata classifications: summary(factor(Mouse)) We can also check if any technical factors are confounded: table(Mouse, Plate) Lastly we will read the computationally inferred cell-type annotation and match them to the cell in our expression matrix: ann <- read.table("FACS_annotations.csv", sep=",", header=TRUE) ann <- ann[match(cellIDs, ann[,1]),] celltype <- ann[,3] 4.10 Building a SingleCellExperiment object To create a SingleCellExperiment object we must put together all the cell annotations into a single dataframe, since the experimental batch (PCR plate) is completely confounded with donor mouse we will only keep one of them. 
library("SingleCellExperiment") library("scater") cell_anns <- data.frame(mouse = Mouse, well=Well, type=celltype) rownames(cell_anns) <- colnames(dat) sceset <- SingleCellExperiment(assays = list(counts = as.matrix(dat)), colData=cell_anns) Finally if the dataset contains spike-ins we use a hidden variable in the SingleCellExperiment object to track them: isSpike(sceset, "ERCC") <- grepl("ERCC-", rownames(sceset)) 4.11 Reading the data (10X) Due to the large size and sparsity of 10X data (up to 90% of the expression matrix may be 0s) it is typically stored as a sparse matrix. The default output format for CellRanger is an .mtx file which stores this sparse matrix as a column of row coordinates, a column of column coordinates, and a column of expression values > 0. Note if you look at the .mtx file you will see two header lines followed by a line detailing the total number of rows, columns and counts for the full matrix. Since only the coordinates are stored in the .mtx file, the names of each row & column must be stored separately in the “genes.tsv” and “barcodes.tsv” files respectively. We will be using the “Matrix” package to store matrices in sparse-matrix format in R. The SingleCellExperiment class naturally handles sparse matrices, and many downstream tools including scater, scran and DropletUtils also handle data stored in sparse matrices, reducing the memory requirements for many early steps in an analysis. The SingleCellExperiment class can also use data in HDF5 format which allows large non-sparse matrices to be stored & accessed on disk in an efficient manner rather than loading the whole thing into RAM. library("Matrix") cellbarcodes <- read.table("droplet/Kidney-10X_P4_5/barcodes.tsv") genenames <- read.table("droplet/Kidney-10X_P4_5/genes.tsv") molecules <- readMM("droplet/Kidney-10X_P4_5/matrix.mtx") Now we will add the appropriate row and column names. 
However, if you inspect the read cellbarcodes you will see that they are just the barcode sequence associated with each cell. This is a problem since each batch of 10X data uses the same pool of barcodes so if we need to combine data from multiple 10X batches the cellbarcodes will not be unique. Hence we will attach the batch ID to each cell barcode: head(cellbarcodes) rownames(molecules) <- genenames[,1] colnames(molecules) <- paste("10X_P4_5", cellbarcodes[,1], sep="_") Now let’s get the metadata and computational annotations for this data: meta <- read.delim("droplet_metadata.csv", sep=",", header = TRUE) head(meta) Here we can see that we need to use 10X_P4_5 to find the metadata for this batch, also note that the format of the mouse ID is different in this metadata table with hyphens instead of underscores and with the gender in the middle of the ID. From checking the methods section of the accompanying paper we know that the same 8 mice were used for both droplet and plate-based techniques. So we need to fix the mouse IDs to be consistent with those used in the FACS experiments. meta[meta$channel == "10X_P4_5",] mouseID <- "3_8_M" Note: depending on the tissue you choose you may have 10X data from mixed samples : e.g. mouse id = 3-M-5/6. You should still reformat these to be consistent but they will not match mouse ids from the FACS data which may affect your downstream analysis. If the mice weren’t from an inbred strain it would be possible to assign individual cells to a specific mouse using exonic-SNPs but that is beyond the scope of this course. ann <- read.delim("droplet_annotation.csv", sep=",", header=TRUE) head(ann) Again you will find a slight formatting difference between the cellID in the annotation and the cellbarcodes which we will have to correct before matching them. 
ann[,1] <- paste(ann[,1], "-1", sep="") ann_subset <- ann[match(colnames(molecules), ann[,1]),] celltype <- ann_subset[,3] Now lets build the cell-metadata dataframe: cell_anns <- data.frame(mouse = rep(mouseID, times=ncol(molecules)), type=celltype) rownames(cell_anns) <- colnames(molecules); Exercise Repeat the above for the other 10X batches for your tissue. Answer 4.12 Building a SingleCellExperiment object for the 10X data Now that we have read the 10X data in multiple batches we need to combine them into a single SingleCellExperiment object. First we will check that the gene names are the same and in the same order across all batches: identical(rownames(molecules1), rownames(molecules2)) identical(rownames(molecules1), rownames(molecules3)) Now we’ll check that there aren’t any repeated cellIDs: sum(colnames(molecules1) %in% colnames(molecules2)) sum(colnames(molecules1) %in% colnames(molecules3)) sum(colnames(molecules2) %in% colnames(molecules3)) Everything is ok, so we can go ahead and combine them: all_molecules <- cbind(molecules1, molecules2, molecules3) all_cell_anns <- as.data.frame(rbind(cell_anns1, cell_anns2, cell_anns3)) all_cell_anns$batch <- rep(c("10X_P4_5", "10X_P4_6","10X_P7_5"), times = c(nrow(cell_anns1), nrow(cell_anns2), nrow(cell_anns3))) Exercise How many cells are in the whole dataset? Answer Now build the SingleCellExperiment object. One of the advantages of the SingleCellExperiment class is that it is capable of storing data in normal matrix or sparse matrix format, as well as HDF5 format which allows large non-sparse matrices to be stored & accessed on disk in an efficient manner rather than loading the whole thing into RAM. 
all_molecules <- as.matrix(all_molecules) sceset <- SingleCellExperiment( assays = list(counts = as.matrix(all_molecules)), colData = all_cell_anns ) Since this is 10X data it will not contain spike-ins, so we just save the data: saveRDS(sceset, "kidney_droplet.rds") 4.13 Advanced Exercise Write an R function/script which will fully automate this procedure for each data-type for any tissue. "], -["processing-raw-scrna-seq-data.html", "5 Processing raw scRNA-seq data 5.1 Generating fastq files from BCLs 5.2 FastQC 5.3 Trimming Reads 5.4 Fastp 5.5 Read alignment and gene expression quantification 5.6 Full-length transcript datasets 5.7 Tag-based datasets 5.8 Practise 5.9 Identifying cell-containing droplets/microwells", " 5 Processing raw scRNA-seq data 5.1 Generating fastq files from BCLs BCLs (Illumina sequencer’s base call files) are binary files with raw sequencing data generated from sequencers. If your data processing starts from BCLs you will need to make fastq files from the BCL files. More on BCL format. For others, you may have received the fastq files from your sequencing facilities or collaborators, you can refer to Section 5.2 for pre-processing on fastq files. 5.1.1 Demultiplexing In cases where multiple sample libraries are pooled together for sequencing on one lane of a flowcell to reduce sequencing cost, we demultiplex the samples by their sample index in the step of making fastq files from BCLs. Sample indices are ‘barcodes’ for multiplexed samples which have been constructed in the read structure during the library preparation. Figure 2.3: Example 10X Final Library Structure 5.1.2 cellranger mkfastq If you are working with 10X Genomics data, it is best to use the cellranger mkfastq pipeline, which wraps Illumina’s bcl2fastq and provides a number of convenient features designed specifically for 10X data format. 
In order to demultiplex samples, you would also need the sample_sheet.csv file which tells the mkfastq pipeline which libraries are sequenced on which lanes of the flowcell and what sample index sets they have. For example when you have multiple libraries sequenced on one lane here: With cellranger mkfastq, you can provide a simpleSampleSheet.csv file that has: Lane Sample Index 1 test_sample SI-P03-C9 1 test_sample2 SI-P03-A3 ... SI-P03-C9 and SI-P03-A3 are the 10x sample index set names. Each of them corresponds to a mix of 4 unique oligonucleotides so that the i7 index read is balanced across all 4 bases during sequencing. There is a list of 96 sample index sets and you can use any one of them to ‘tag’ your samples. An example command to run cellranger mkfastq /mnt/Software/cellranger/cellranger-3.0.2/cellranger mkfastq \\ --run ./input.runs.folder/ --samplesheet {input.samplesheet.csv} \\ --id run_id \\ --qc --project MAXL_2019_LIM_organoid_RNAseq \\ --output-dir data/fastq_path/ \\ --jobmode=local --localcores=20 --localmem=50 1> {log} After mkfastq, you end up with each sample’s fastq files from each sequencing lane: test_sample_S1_L001_I1_001.fastq.gz test_sample_S1_L001_R1_001.fastq.gz test_sample_S1_L001_R2_001.fastq.gz test_sample2_S2_L001_I1_001.fastq.gz test_sample2_S2_L001_R1_001.fastq.gz test_sample2_S2_L001_R2_001.fastq.gz 5.1.3 Illumina bcl2fastq You can also use Illumina’s bcl2fastq tool directly and it is more generally applicable. bcl2fastq converts BCLs to fastqs while optionally demultiplexing sequencing data. Find the documentation of the tool here; training videos may also help to come to grips with using this tool. Figure 2.4: Sample Demultiplexing You will need to supply a SampleSheet.csv file like this: Figure 2.5: SampleSheet.csv file This information should come from your sequencing facilities. 
Running bcl2fastq can then be done like this: /usr/local/bin/bcl2fastq --runfolder-dir <RunFolder> --output-dir <BaseCalls> The output fastq files are named as SampleName_SampleNumber_Lane_Read_001.fastq.gz, the same as the cellranger mkfastq output. (eg: Sample1_S1_L001_R1_001.fastq.gz) 5.2 FastQC Once you’ve obtained your single-cell RNA-seq data, the first thing you need to do with it is check the quality of the reads you have sequenced. For this task, today we will be using a tool called FastQC. FastQC is a quality control tool for sequencing data, which can be used for both bulk and single-cell RNA-seq data. FastQC takes sequencing data as input and returns a report on read quality. Copy and paste this link into your browser to visit the FastQC website: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ This website contains links to download and install FastQC and documentation on the reports produced. Scroll down the webpage to ‘Example Reports’ and click ‘Good Illumina Data’. This gives an example of what an ideal report should look like for high quality Illumina reads data. Now let’s make a FastQC report ourselves. Today we will be performing our analysis using a single cell from an mESC dataset produced by (Kolodziejczyk et al. 2015). The cells were sequenced using the SMART-seq2 library preparation protocol and the reads are paired end. Note You will have to download the files (both ERR522959_1.fastq and ERR522959_2.fastq) and create the Share directory yourself to run the commands. You can find the files here: https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-2600/samples/ Now let’s look at the files: less Share/ERR522959_1.fastq less Share/ERR522959_2.fastq Task 1: Try to work out what command you should use to produce the FastQC report. Hint: Try executing fastqc -h This command will tell you what options are available to pass to FastQC. Feel free to ask for help if you get stuck! 
If you are successful, you should generate a .zip and a .html file for both the forwards and the reverse reads files. Once you have been successful, feel free to have a go at the next section. 5.2.1 Solution and Downloading the Report If you haven’t done so already, generate the FastQC report using the commands below: mkdir fastqc_results fastqc -o fastqc_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq Once the command has finished executing, you should have a total of four files - one zip file for each of the paired end reads, and one html file for each of the paired end reads. The report is in the html file. To view it, we will need to get it onto your computer using either filezilla or scp. Ask an instructor if you are having difficulties. Once the file is on your computer, click on it. Your FastQC report should open. Have a look through the file. Remember to look at both the forwards and the reverse end read reports! How good quality are the reads? Is there anything we should be concerned about? How might we address those concerns? 5.2.2 10X fastq quality checks If you have generated the fastq files from cellranger mkfastq as we discussed before, you can get a list of quality metrics from the output files. 
The first thing to look at will be the qc_summary.json file which contains lines like this: "sample_qc": { "Sample1": { "5": { "barcode_exact_match_ratio": 0.9336158258904611, "barcode_q30_base_ratio": 0.9611993091728814, "bc_on_whitelist": 0.9447542078230667, "mean_barcode_qscore": 37.770630795934, "number_reads": 2748155, "read1_q30_base_ratio": 0.8947676653366835, "read2_q30_base_ratio": 0.7771883245304577 }, "all": { "barcode_exact_match_ratio": 0.9336158258904611, "barcode_q30_base_ratio": 0.9611993091728814, "bc_on_whitelist": 0.9447542078230667, "mean_barcode_qscore": 37.770630795934, "number_reads": 2748155, "read1_q30_base_ratio": 0.8947676653366835, "read2_q30_base_ratio": 0.7771883245304577 } } } 5.3 Trimming Reads Fortunately there is software available for read trimming. Today we will be using Trim Galore!. Trim Galore! is a wrapper for the reads trimming software cutadapt and fastqc. Read trimming software can be used to trim sequencing adapters and/or low quality reads from the ends of reads. Given we noticed there was some adaptor contamination in our FastQC report, it is a good idea to trim adaptors from our data. Task 2: What type of adapters were used in our data? Hint: Look at the FastQC report ‘Adapter Content’ plot. Now let’s try to use Trim Galore! to remove those problematic adapters. It’s a good idea to check read quality again after trimming, so after you have trimmed your reads you should use FastQC to produce another report. Task 3: Work out the command you should use to trim the adapters from our data. Hint 1: You can use the following command to find out what options you can pass to Trim Galore. trim_galore -h Hint 2: Read through the output of the above command carefully. The adaptor used in this experiment is quite common. Do you need to know the actual sequence of the adaptor to remove it? Task 4: Produce a FastQC report for your trimmed reads files. Is the adapter contamination gone? 
Once you think you have successfully trimmed your reads and have confirmed this by checking the FastQC report, feel free to check your results using the next section. 5.3.1 Solution You can use the command(s) below to trim the Nextera sequencing adapters: mkdir fastqc_trimmed_results trim_galore --nextera -o fastqc_trimmed_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq Remember to generate new FastQC reports for your trimmed reads files! FastQC should now show that your reads pass the ‘Adaptor Content’ plot. Feel free to ask one of the instructors if you have any questions. Congratulations! You have now generated reads quality reports and performed adaptor trimming. In the next lab, we will use STAR and Kallisto to align our trimmed and quality-checked reads to a reference transcriptome. 5.4 Fastp Fastp is an ‘all-in-one’ pre-processing tool to run on fastq files which has integrated a lot of aspects of quality profiling for both before and after filtering data (quality curves, base contents, KMER, Q20/Q30, GC Ratio, duplication, adapter contents, etc.). Example usage: mkdir fastp_results fastp -i Share/ERR522959_1.fastq -I Share/ERR522959_2.fastq \\ -o fastp_results/ERR522959_1.fastp.fastq -O fastp_results/ERR522959_2.fastp.fastq \\ --length_required 20 --average_qual 20 --detect_adapter_for_pe --correction \\ -h fastp_results/ERR522959.html -j fastp_results/ERR522959.json 5.5 Read alignment and gene expression quantification Now we have trimmed our reads and established that they are of good quality, we would like to map them to a reference genome. This process is known as alignment. Some form of alignment is generally required if we want to quantify gene expression or find genes which are differentially expressed between samples. Many tools have been developed for read alignment. STAR (Dobin et al. 2013) is one of the most popularly used tools in RNA-seq read alignment. 
There are a bunch of alignment and gene expression quantification tools that are designed specifically for single-cell RNA-seq data too. Depending on your single-cell RNA-seq protocols used and the datasets generated, you can go with the following two workflows presented here for full-length or tag-based datasets. Thus, today we will focus on two alignment tools: STAR and Kallisto-BUStools, and we will discuss other available tools at the end of this chapter. 5.6 Full-length transcript datasets If your single-cell RNA-seq dataset is from plate-based protocol like Smart-seq2, then your dataset can be aligned and quantified just like a bulk RNA-seq dataset. Each cell has a proper pair (if it’s paired-end sequencing) of fastq files (there are no CB/UMI tags in the reads). STAR is a good choice for alignment (other good choices could be Subread or Hisat2). 5.6.1 Using STAR to align reads STAR tries to find the longest possible sequence which matches one or more sequences in the reference genome. For example, in the figure below, we have a read (blue) which spans two exons and an alternative splicing junction (purple). STAR finds that the first part of the read is the same as the sequence of the first exon, whilst the second part of the read matches the sequence in the second exon. Because STAR is able to recognise splicing events in this way, it is described as a ‘splice aware’ aligner. Figure 2.3: Diagram of how STAR performs alignments, taken from Dobin et al. Usually STAR aligns reads to a reference genome, potentially allowing it to detect novel splicing events or chromosomal rearrangements. 5.6.2 Expression quantification Now you have your aligned reads in a .bam file for your single cells. The next step is to quantify the expression level of each gene per cell. We can use one of the tools which has been developed for bulk RNA-seq data, e.g. HT-seq or FeatureCounts which do ‘simple’ counting of reads overlapping with genomic features. 
Here we demonstrate an example with featureCounts, that counts mapped reads for genomic features such as genes, exons, promoter, gene bodies, genomic bins and chromosomal locations. # include multimapping <featureCounts_path>/featureCounts -O -M -Q 30 -p -a hg_annotations.gtf -o outputfile ERR522959.bam # exclude multimapping <featureCounts_path>/featureCounts -Q 30 -p -a hg_annotations.gtf -o outputfile ERR522959.bam Then you will have your read counts gene expression matrix that’s ready for downstream analysis. 5.7 Tag-based datasets If your dataset is tag-based, for example 10X dataset, then you typically have sequences in R1 that entirely encode for read identities such as Cell Barcode and UMI tags. In most cases, all your cell reads (for 1k-10K cells) are in one set of fastq files. Instead of trying to demultiplex all cells into separate fastqs then do alignment and quantification, we can use tools that take care of this for you. In the following steps, we use Kallisto with bustools for generating the gene expression quantification matrix for your tag-based datasets. 5.7.1 Cellranger count If you work with 10X dataset, cellranger count pipeline may just work well for you. It comes with cellranger software suite with convenient features for 10X datasets. It takes the fastqs of a sample, and uses STAR to align all cells’ reads. It also includes reads filtering, barcode counting, and UMI counting. The output of this pipeline includes the aligned.bam file and the quantified gene expression matrix in both filtered and raw format. In V3, it has adopted the EmptyDroplet method (Lun et al., 2018), an algorithm that tries to distinguish true cell barcodes from barcodes associated with droplets that did not contain a cell (i.e. empty droplets). (More details in Section 5.9 ). The filtered gene expression matrix by cellranger count V3 only includes the true cell barcodes determined by EmptyDroplet method. 
5.7.2 Kallisto/bustools and pseudo-alignment STAR is a reads aligner, whereas Kallisto is a pseudo-aligner (Bray et al. 2016). The main difference between aligners and pseudo-aligners is that whereas aligners map reads to a reference, pseudo-aligners map k-mers to a reference. 5.7.3 What is a k-mer? A k-mer is a sequence of length k derived from a read. For example, imagine we have a read with the sequence ATCCCGGGTTAT and we want to make 7-mers from it. To do this, we would find the first 7-mer by counting the first seven bases of the read. We would find the second 7-mer by moving one base along, then counting the next seven bases. Below shows all the 7-mers that could be derived from our read: ATCCCGGGTTAT ATCCCGG TCCCGGG CCCGGGT CCGGGTT CGGGTTA GGGTTAT 5.7.4 Why map k-mers rather than reads? There are two main reasons: Pseudo-aligners use k-mers and a computational trick to make pseudo-alignment much faster than traditional aligners. If you are interested in how this is achieved, see (Bray et al. 2016) for details. Under some circumstances, pseudo-aligners may be able to cope better with sequencing errors than traditional aligners. For example, imagine there was a sequencing error in the first base of the read above and the A was actually a T. This would impact on the pseudo-aligner’s ability to map the first 7-mer but none of the following 7-mers. 5.7.5 Kallisto’s pseudo mode Kallisto has a specially designed mode for pseudo-aligning reads from single-cell RNA-seq experiments. Unlike STAR, Kallisto pseudo-aligns to a reference transcriptome rather than a reference genome. This means Kallisto maps reads to splice isoforms rather than genes. Mapping reads to isoforms rather than genes is especially challenging for single-cell RNA-seq for the following reasons: Single-cell RNA-seq is lower coverage than bulk RNA-seq, meaning the total amount of information available from reads is reduced. 
Many single-cell RNA-seq protocols have 3’ coverage bias, meaning if two isoforms differ only at their 5’ end, it might not be possible to work out which isoform the read came from. Some single-cell RNA-seq protocols have short read lengths, which can also mean it is not possible to work out which isoform the read came from. Kallisto’s pseudo mode takes a slightly different approach to pseudo-alignment. Instead of aligning to isoforms, Kallisto aligns to equivalence classes. Essentially, this means if a read maps to multiple isoforms, Kallisto records the read as mapping to an equivalence class containing all the isoforms it maps to. Figure 2 shows a diagram which helps explain this. Figure 2.4: Overview of kallisto, The input consists of a reference transcriptome and reads from an RNA-seq experiment. (a) An example of a read (in black) and three overlapping transcripts with exonic regions as shown. (b) An index is constructed by creating the transcriptome de Bruijn Graph (T-DBG) where nodes (v1, v2, v3, … ) are k-mers, each transcript corresponds to a colored path as shown and the path cover of the transcriptome induces a k-compatibility class for each k-mer. (c) Conceptually, the k-mers of a read are hashed (black nodes) to find the k-compatibility class of a read. (d) Skipping (black dashed lines) uses the information stored in the T-DBG to skip k-mers that are redundant because they have the same k-compatibility class. (e) The k-compatibility class of the read is determined by taking the intersection of the k-compatibility classes of its constituent k-mers. Taken from Bray et al (2016). Figure 2.5: A diagram explaining Kallisto’s Equivalence Classes, taken from Ntranos et al. Note Instead of using gene or isoform expression estimates in downstream analysis such as clustering, equivalence class counts can be used instead, in this course, we focus on using gene level estimation. 
5.7.6 Running kallisto pseudo-alignment and BUStools Today, we will talk about doing single-cell pseudo-alignemnt and gene level quantification with Kallisto|BUStools. See https://pachterlab.github.io/kallisto/manual for details. As for STAR, you will need to produce an index for Kallisto before the pseudo-alignment step. Use the below command to produce the Kallisto index. Use the Kallisto manual (https://pachterlab.github.io/kallisto/manual) to work out what the options do in this command. mkdir indices/Kallisto kallisto index -i indices/Kallisto/GRCm38.idx Share/mouse/Ensembl.GRCm38.96/Mus_musculus.GRCm38.cdna.all.fa.gz In this step, an index is constructed by creating the transcriptome de Bruijn Graph (T-DBG). 5.7.6.1 BUS format BUS is a binary file format designed for UMI-tagged single-cell datasets with pseudo-aligned reads labelled with CB and UMI tags. Figure 2.7: BUS format, taken from Melsted,Páll et al. We do kallisto bus on the fastqs of single cells to generate the BUS file and then use BUStools on the generated bus files to get a gene level quantification. Check the list of technologies supported by Kallisto BUStools by kallisto bus -l Use the below command to perform pseudo-alignment and generate bus files for single-cell sequencing data. -x argument specifies the technology. List of supported single-cell technologies short name description ---------- ----------- 10xv1 10x version 1 chemistry 10xv2 10x version 2 chemistry 10xv3 10x version 3 chemistry CELSeq CEL-Seq CELSeq2 CEL-Seq version 2 DropSeq DropSeq inDrops inDrops SCRBSeq SCRB-Seq SureCell SureCell for ddSEQ mkdir results/Kallisto kallisto bus -i indices/Kallisto/GRCm38.idx -o results/Kallisto/output_bus -x '10xv2' -t 4 \\ SI-GA-G1/W11_S1_L001_R1_001.fastq.gz SI-GA-G1/W11_S1_L001_R2_001.fastq.gz \\ SI-GA-G1/W11_S1_L002_R1_001.fastq.gz SI-GA-G1/W11_S1_L002_R2_001.fastq.gz See https://pachterlab.github.io/kallisto/manual for instructions on creating bus files. 
5.7.7 Understanding the Output of Kallisto BUS Pseudo-Alignment The command above should produce 4 files - matrix.ec, transcripts.txt, run_info.json and output.bus transcripts.txt contains a list of transcripts, in the same order as in the transcriptome fasta file. matrix.ec contains information about the equivalence classes used. The first number in each row is the equivalence class ID. The second number(s) correspond to the transcript ID(s) in that equivalence class. For example “10 1,2,3” would mean that equivalence class 10 contains transcript IDs 1,2 and 3. The ID numbers correspond to the order that the transcripts appear in transcripts.txt. Zero indexing is used, meaning transcript IDs 1,2 and 3 correspond to the second, third and fourth transcripts in transcripts.txt. output.bus contains the binary formatted Cell Barcode and UMI tags and sets of equivalence classes of transcripts obtained by pseudoalignment. (The fourth column is count of reads with this barcode, UMI, and equivalence class combination, which is ignored as one UMI should stand for one molecule.) run_info.json contains information about how Kallisto was executed and can be ignored. 5.7.8 Running Bustools Inputs: transcripts_to_genes.tsv: a tab delimited file of a specific format: No headers, first column is transcript ID, and second column is the corresponding gene ID. Transcript IDs must be in the same order as in the kallisto index. barcode whitelist: A whitelist that contains all the barcodes known to be present in the kit is provided by 10x and comes with CellRanger. First, bustools runs barcode error correction on the bus file. Then, the corrected bus file is sorted by barcode, UMI, and equivalence classes. After that the UMIs are counted and the counts are collapsed to the gene level. 
mkdir ./output/out_bustools/genecount ./tmp bustools correct -w ./data/whitelist_v2.txt -p ./output/out_bustools/output.bus | \\ bustools sort -T tmp/ -t 4 -p - | \\ bustools count -o ./output/out_bustools/genecount/genes -g ./output/tr2g_hgmm.tsv \\ -e ./output/out_bustools/matrix.ec -t ./output/out_bustools/transcripts.txt --genecounts - The output includes: genes.barcodes.txt genes.genes.txt genes.mtx 5.7.9 Other alignment and quantification tools available Alevin Alevin is a tool for 10X and Drop-seq data that comes with Salmon which is also a ‘pseudo-aligner’ for transcriptome quantification. Salmon is conceptually similar to Kallisto but uses different models for parameter estimation and accounts for sequence (3’ 5’-end and Fragment GC) bias correction. STARsolo STARsolo is integrated with STAR. It does mapping, demultiplexing and gene quantification for droplet-based single-cell RNA-seq (eg. 10X genomics). It follows a similar logic as Cellranger count pipeline which does error correction, UMI deduplication and then quantifies expression per gene for each cell by counting reads with different UMIs mapped per gene. STARsolo is potentially ten times faster than Cellranger count. If you are interested, here is a paper by Páll et al that compares performance of workflows in single-cell RNA-seq preprocessing. (https://www.biorxiv.org/content/10.1101/673285v2.full). 5.7.10 Summary Full-transcripts dataset: STAR -> featureCounts Tag-based dataset: Kallisto bus -> Bustools 5.8 Practise 5.8.1 Using STAR One issue with STAR is that it needs a lot of RAM, especially if your reference genome is large (eg. mouse and human). To speed up our analysis today, we will use STAR to align reads to a reference genome. Two steps are required to perform STAR alignment. In the first step, the user provides STAR with reference genome sequences (FASTA) and annotations (GTF), which STAR uses to create a genome index. 
In the second step, STAR maps the user’s reads data to the genome index. Let’s create the index now. You can obtain genomes for many model organisms from Ensembl (https://www.ensembl.org/info/data/ftp/index.html). Task 1: Execute the commands below to create the index: mkdir indices mkdir indices/STAR STAR --runThreadN 4 --runMode genomeGenerate --genomeDir indices/STAR --genomeFastaFiles Share/hg19.fa --sjdbGTFfile Share/hg_annotations.gtf Task 2: What does each of the options we used do? Hint: Use the STAR manual to help you (https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf) Task 3: How would the command we used in Task 1 be different if we were aligning to the genome rather than the transcriptome? Now that we have created the index, we can perform the mapping step. Task 4: Try to work out what command you should use to map our trimmed reads (from ERR522959) to the index you created. Use the STAR manual to help you. Once you think you know the answer, check whether it matches the solution in the next section and execute the alignment. Task 5: Try to understand the output of your alignment. Talk to one of the instructors if you need help! 5.8.2 Solution for STAR Alignment You can use the following commands to perform the mapping step: mkdir results mkdir results/STAR STAR --runThreadN 4 --genomeDir indices/STAR --readFilesIn Share/ERR522959_1.fastq Share/ERR522959_2.fastq \\ --outFileNamePrefix results/STAR/ERR522959 5.9 Identifying cell-containing droplets/microwells For droplet based methods only a fraction of droplets contain both beads and an intact cell. However, biology experiments are messy and some RNA will leak out of dead/damaged cells. So droplets without an intact cell are likely to capture a small amount of the ambient RNA which will end up in the sequencing library and contribute reads to the final sequencing output. 
The variation in droplet size, amplification efficiency, and sequencing will lead both “background” and real cells to have a wide range of library sizes. Various approaches have been used to try to distinguish those cell barcodes which correspond to real cells. 5.9.1 ‘Knee’ point One of the most used methods uses the total molecules (could be applied to total reads) per barcode and tries to find a “break point” between bigger libraries which are cells + some background and smaller libraries assumed to be purely background. Let’s load some example simulated data which contain both large and small cells: umi_per_barcode <- read.table("data/droplet_id_example_per_barcode.txt.gz") truth <- read.delim("data/droplet_id_example_truth.gz", sep=",") Exercise How many unique barcodes were detected? How many true cells are present in the data? To simplify calculations for this section exclude all barcodes with fewer than 10 total molecules. Answer One approach is to look for the inflection point where the total molecules per barcode suddenly drops: barcode_rank <- rank(-umi_per_barcode[,2]) plot(barcode_rank, umi_per_barcode[,2], xlim=c(1,8000)) Here we can see a roughly exponential curve of library sizes, so to make things simpler let’s log-transform them. log_lib_size <- log10(umi_per_barcode[,2]) plot(barcode_rank, log_lib_size, xlim=c(1,8000)) That’s better, the “knee” in the distribution is much more pronounced. We could manually estimate where the “knee” is but it is much more reproducible to algorithmically identify this point. 
# inflection point o <- order(barcode_rank) log_lib_size <- log_lib_size[o] barcode_rank <- barcode_rank[o] rawdiff <- diff(log_lib_size)/diff(barcode_rank) inflection <- which(rawdiff == min(rawdiff[100:length(rawdiff)], na.rm=TRUE)) plot(barcode_rank, log_lib_size, xlim=c(1,8000)) abline(v=inflection, col="red", lwd=2) threshold <- 10^log_lib_size[inflection] cells <- umi_per_barcode[umi_per_barcode[,2] > threshold,1] TPR <- sum(cells %in% truth[,1])/length(cells) Recall <- sum(cells %in% truth[,1])/length(truth[,1]) c(TPR, Recall) 5.9.2 Mixture model Another is to fit a mixture model and find where the higher and lower distributions intersect. However, data may not fit the assumed distributions very well: set.seed(-92497) # mixture model require("mixtools") mix <- normalmixEM(log_lib_size) plot(mix, which=2, xlab2="log(mol per cell)") p1 <- dnorm(log_lib_size, mean=mix$mu[1], sd=mix$sigma[1]) p2 <- dnorm(log_lib_size, mean=mix$mu[2], sd=mix$sigma[2]) if (mix$mu[1] < mix$mu[2]) { split <- min(log_lib_size[p2 > p1]) } else { split <- min(log_lib_size[p1 > p2]) } Exercise Identify cells using this split point and calculate the TPR and Recall. Answer 5.9.3 Expected Number of Cells A third method used by CellRanger V2, assumes a ~10-fold range of library sizes for real cells and estimates this range using the expected number of cells. n_cells <- length(truth[,1]) # CellRanger v2 totals <- umi_per_barcode[,2] totals <- sort(totals, decreasing = TRUE) # 99th percentile of top n_cells divided by 10 thresh = totals[round(0.01*n_cells)]/10 plot(totals, xlim=c(1,8000)) abline(h=thresh, col="red", lwd=2) Exercise Identify cells using this threshold and calculate the TPR and Recall. Answer 5.9.4 EmptyDroplets Finally [EmptyDrops](https://github.com/MarioniLab/DropletUtils) is what we recommend using in calling cell barcodes for droplet-based single-cell datasets. 
It should be noted that in cellranger count v3, the EmptyDroplet algorithm has been applied in their filtering for the true cell barcode step. Instead of trying to find a ‘threshold’ in UMI counts for determining true cells, EmptyDroplet uses the full genes x cells molecule count matrix for all droplets and estimates the profile of “background” RNA from those droplets with extremely low counts, then looks for cells with gene-expression profiles which differ from the background. This is combined with an inflection point method since background RNA often looks very similar to the expression profile of the largest cells in a population. As such EmptyDrops is the only method able to identify barcodes for very small cells in highly diverse samples. Below we have provided code for how this method is currently run: library("Matrix") raw.counts <- readRDS("data/pancreas/muraro.rds") library("DropletUtils") example(write10xCounts, echo=FALSE) dir.name <- tmpdir list.files(dir.name) sce <- read10xCounts(dir.name) sce my.counts <- DropletUtils:::simCounts() br.out <- barcodeRanks(my.counts) # Making a plot. 
plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total") o <- order(br.out$rank) lines(br.out$rank[o], br.out$fitted[o], col="red") abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2) abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2) legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), legend=c("knee", "inflection")) # emptyDrops set.seed(100) e.out <- emptyDrops(my.counts) is.cell <- e.out$FDR <= 0.01 sum(is.cell, na.rm=TRUE) plot(e.out$Total, -e.out$LogProb, col=ifelse(is.cell, "red", "black"), xlab="Total UMI count", ylab="-Log Probability") # plot(e.out$Total, -e.out$LogProb, col=ifelse(is.cell, "red", "black"), # xlab="Total UMI count", ylab="-Log Probability") # # cells <- colnames(raw.counts)[is.cell] # # TPR <- sum(cells %in% truth[,1])/length(cells) # Recall <- sum(cells %in% truth[,1])/length(truth[,1]) # c(TPR, Recall) References "], -["quality-control-and-data-visualisation.html", "6 Quality control and data visualisation 6.1 Expression QC overview (UMI) 6.2 Cell QC 6.3 Doublet detection 6.4 Gene QC 6.5 Exercise: Expression QC (Reads) 6.6 Data visualization and exploratory data analysis 6.7 Exercise: Data visualization (Reads)", " 6 Quality control and data visualisation The principle of garbage in, garbage out is at least as strong in single-cell genomics as it is elsewere in science. Effective quality control (QC) is crucial to high-quality scRNA-seq data analysis. We discuss principles and strategies for QC in this chapter, along with some discussion and demonstration of data visualisation approaches. 6.1 Expression QC overview (UMI) 6.1.1 Introduction Once gene expression has been quantified it is summarized as an expression matrix where each row corresponds to a gene (or transcript) and each column corresponds to a single cell. This matrix should be examined to remove poor quality cells which were not detected in either read QC or mapping QC steps. 
Failure to remove low quality cells at this stage may add technical noise which has the potential to obscure the biological signals of interest in the downstream analysis. Since there is currently no standard method for performing scRNASeq the expected values for the various QC measures that will be presented here can vary substantially from experiment to experiment. Thus, to perform QC we will be looking for cells which are outliers with respect to the rest of the dataset rather than comparing to independent quality standards. Consequently, care should be taken when comparing quality metrics across datasets collected using different protocols. 6.1.2 Tung dataset To illustrate cell QC, we consider a dataset of induced pluripotent stem cells generated from three different individuals (Tung et al. 2017) in Yoav Gilad’s lab at the University of Chicago. The experiments were carried out on the Fluidigm C1 platform and to facilitate the quantification both unique molecular identifiers (UMIs) and ERCC spike-ins were used. The data files are located in the tung folder in your working directory. These files are the copies of the original files made on the 15/03/16. We will use these copies for reproducibility purposes. 
library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) Load the data and annotations: molecules <- read.table("data/tung/molecules.txt", sep = "\\t") anno <- read.table("data/tung/annotation.txt", sep = "\\t", header = TRUE) Inspect a small portion of the expression matrix head(molecules[ , 1:3]) ## NA19098.r1.A01 NA19098.r1.A02 NA19098.r1.A03 ## ENSG00000237683 0 0 0 ## ENSG00000187634 0 0 0 ## ENSG00000188976 3 6 1 ## ENSG00000187961 0 0 0 ## ENSG00000187583 0 0 0 ## ENSG00000187642 0 0 0 head(anno) ## individual replicate well batch sample_id ## 1 NA19098 r1 A01 NA19098.r1 NA19098.r1.A01 ## 2 NA19098 r1 A02 NA19098.r1 NA19098.r1.A02 ## 3 NA19098 r1 A03 NA19098.r1 NA19098.r1.A03 ## 4 NA19098 r1 A04 NA19098.r1 NA19098.r1.A04 ## 5 NA19098 r1 A05 NA19098.r1 NA19098.r1.A05 ## 6 NA19098 r1 A06 NA19098.r1 NA19098.r1.A06 The data consists of 3 individuals and r length(unique(anno$replicate)) replicates and therefore has r length(unique(anno$batch)) batches in total. We standardize the analysis by using both SingleCellExperiment (SCE) and scater packages. 
First, create the SCE object: umi <- SingleCellExperiment( assays = list(counts = as.matrix(molecules)), colData = anno ) Remove genes that are not expressed in any cell: keep_feature <- rowSums(counts(umi) > 0) > 0 umi <- umi[keep_feature, ] Define control features (genes) - ERCC spike-ins and mitochondrial genes (provided by the authors): isSpike(umi, "ERCC") <- grepl("^ERCC-", rownames(umi)) isSpike(umi, "MT") <- rownames(umi) %in% c("ENSG00000198899", "ENSG00000198727", "ENSG00000198888", "ENSG00000198886", "ENSG00000212907", "ENSG00000198786", "ENSG00000198695", "ENSG00000198712", "ENSG00000198804", "ENSG00000198763", "ENSG00000228253", "ENSG00000198938", "ENSG00000198840") Calculate the quality metrics: umi <- calculateQCMetrics( umi, feature_controls = list( ERCC = isSpike(umi, "ERCC"), MT = isSpike(umi, "MT") ) ) ## Warning in calculateQCMetrics(umi, feature_controls = list(ERCC = ## isSpike(umi, : spike-in set 'ERCC' overwritten by feature_controls set of ## the same name 6.2 Cell QC 6.2.1 Library size Next we consider the total number of RNA molecules detected per sample (if we were using read counts rather than UMI counts this would be the total number of reads). Wells with few reads/molecules are likely to have been broken or failed to capture a cell, and should thus be removed. hist( umi$total_counts, breaks = 100 ) abline(v = 25000, col = "red") Figure 6.1: Histogram of library sizes for all cells Exercise 1 How many cells does our filter remove? What distribution do you expect that the total number of molecules for each cell should follow? Our answer ## filter_by_total_counts ## FALSE TRUE ## 46 818 6.2.2 Detected genes In addition to ensuring sufficient sequencing depth for each sample, we also want to make sure that the reads are distributed across the transcriptome. Thus, we count the total number of unique genes detected in each sample. 
hist( umi$total_features_by_counts, breaks = 100 ) abline(v = 7000, col = "red") Figure 6.2: Histogram of the number of detected genes in all cells From the plot we conclude that most cells have between 7,000-10,000 detected genes, which is normal for high-depth scRNA-seq. However, this varies by experimental protocol and sequencing depth. For example, droplet-based methods or samples with lower sequencing-depth typically detect fewer genes per cell. The most notable feature in the above plot is the “heavy tail†on the left hand side of the distribution. If detection rates were equal across the cells then the distribution should be approximately normal. Thus we remove those cells in the tail of the distribution (fewer than 7,000 detected genes). Exercise 2 How many cells does our filter remove? Our answer ## filter_by_expr_features ## FALSE TRUE ## 116 748 6.2.3 ERCCs and MTs Another measure of cell quality is the ratio between ERCC spike-in RNAs and endogenous RNAs. This ratio can be used to estimate the total amount of RNA in the captured cells. Cells with a high level of spike-in RNAs had low starting amounts of RNA, likely due to the cell being dead or stressed which may result in the RNA being degraded. plotColData( umi, x = "total_features_by_counts", y = "pct_counts_MT", colour = "batch" ) Figure 6.3: Percentage of counts in MT genes plotColData( umi, x = "total_features_by_counts", y = "pct_counts_ERCC", colour = "batch" ) Figure 6.4: Percentage of counts in ERCCs The above analysis shows that majority of the cells from NA19098.r2 batch have a very high ERCC/Endo ratio. Indeed, it has been shown by the authors that this batch contains cells of smaller size. Exercise 3 Create filters for removing batch NA19098.r2 and cells with high expression of mitochondrial genes (>10% of total counts in a cell). 
Our answer ## filter_by_ERCC ## FALSE TRUE ## 96 768 ## filter_by_MT ## FALSE TRUE ## 31 833 Exercise 4 What would you expect to see in the ERCC vs counts plot if you were examining a dataset containing cells of different sizes (eg. normal & senescent cells)? Answer You would expect to see a group corresponding to the smaller cells (normal) with a higher fraction of ERCC reads than a separate group corresponding to the larger cells (senescent). 6.2.4 Cell filtering 6.2.4.1 Manual Now we can define a cell filter based on our previous analysis: umi$use <- ( # sufficient features (genes) filter_by_expr_features & # sufficient molecules counted filter_by_total_counts & # sufficient endogenous RNA filter_by_ERCC & # remove cells with unusual number of reads in MT genes filter_by_MT ) table(umi$use) ## ## FALSE TRUE ## 207 657 6.2.4.2 Automatic Another option available in scater is to conduct PCA on a set of QC metrics and then use automatic outlier detection to identify potentially problematic cells. By default, the following metrics are used for PCA-based outlier detection: pct_counts_top_100_features total_features pct_counts_feature_controls n_detected_feature_controls log10_counts_endogenous_features log10_counts_feature_controls scater first creates a matrix where the rows represent cells and the columns represent the different QC metrics. Then, outlier cells can also be identified by using the mvoutlier package on the QC metrics for all cells. This will identify cells that have substantially different QC metrics from the others, possibly corresponding to low-quality cells. We can visualize any outliers using a principal components plot as shown below: umi <- runPCA( umi, use_coldata = TRUE, detect_outliers = TRUE ) reducedDimNames(umi) ## [1] "PCA_coldata" Column subsetting can then be performed based on the $outlier slot, which indicates whether or not each cell has been designated as an outlier. 
Automatic outlier detection can be informative, but a close inspection of QC metrics and tailored filtering for the specifics of the dataset at hand is strongly recommended. table(umi$outlier) ## ## FALSE TRUE ## 791 73 Then, we can use a PCA plot to see a 2D representation of the cells ordered by their quality metrics. plotReducedDim( umi, use_dimred = "PCA_coldata", size_by = "total_features_by_counts", shape_by = "use", colour_by = "outlier" ) 6.2.5 Compare filterings Exercise 5 Compare the default, automatic and manual cell filters. Plot a Venn diagram of the outlier cells from these filterings. Hint: Use vennCounts and vennDiagram functions from the limma package to make a Venn diagram. Answer Figure 6.5: Comparison of the default, automatic and manual cell filters 6.3 Doublet detection For droplet-based datasets, there is chance that multiple cells are enclosed in one droplet resulting one cell barcode actually containing read information from multiple cells. One way to find doublets/multiplets in the data is to see if there are cells co-expressing markers of distinct cell types. There are also computational tools available for detecting potential doublets in the cells. A lot of these tools rely on artifical doublets formed from the datasets by randomly joining the expression profiles of two cells. Then the cells are tested against the artificial doublet profiles. We demonstrate the usage of two of these doublet detection tools. 6.3.1 scds scds has two detection methods: co-expression based; binary-classification based. In co-expression based approach, the gene-pairs’ co-expression probablities are estimated based on a binomial model and gene pairs that do not co-expression often get higher scores when they co-expression in some cells. The cells’ doublet scores are derived based on the co-expression of pairs of genes. 
In the binary classification based approach, artificial doublet clusters are generated and cells are difficult to separte from the artificial doublets get higher doublet scores. library(scds) #- Annotate doublet using co-expression based doublet scoring: umi = cxds(umi) #- Annotate doublet using binary classification based doublet scoring: umi = bcds(umi) ## [1] train-error:0.065830+0.003564 test-error:0.099490+0.021301 ## Multiple eval metrics are present. Will use test_error for early stopping. ## Will train until test_error hasn't improved in 2 rounds. ## ## [2] train-error:0.050057+0.006014 test-error:0.079285+0.013055 ## [3] train-error:0.039643+0.003520 test-error:0.071160+0.011813 ## [4] train-error:0.033855+0.005592 test-error:0.065367+0.015263 ## [5] train-error:0.029079+0.005075 test-error:0.065377+0.010823 ## [6] train-error:0.026621+0.005039 test-error:0.061324+0.010783 ## [7] train-error:0.019819+0.003215 test-error:0.054953+0.011878 ## [8] train-error:0.018662+0.003859 test-error:0.054358+0.014838 ## [9] train-error:0.016493+0.002438 test-error:0.058421+0.010390 ## [10] train-error:0.014901+0.004077 test-error:0.056687+0.009949 ## Stopping. Best iteration: ## [8] train-error:0.018662+0.003859 test-error:0.054358+0.014838 ## ## [1] train-error:0.061921 ## Will train until train_error hasn't improved in 2 rounds. 
## ## [2] train-error:0.052083 ## [3] train-error:0.039352 ## [4] train-error:0.031829 #- Combine both annotations into a hybrid annotation umi = cxds_bcds_hybrid(umi) #- Doublet scores are now available via colData: CD = colData(umi) head(cbind(CD$cxds_score,CD$bcds_score, CD$hybrid_score)) ## [,1] [,2] [,3] ## NA19098.r1.A01 4131.405 0.05192234 0.2552833 ## NA19098.r1.A02 4564.089 0.03846648 0.2656644 ## NA19098.r1.A03 2827.904 0.03932181 0.1647904 ## NA19098.r1.A04 4708.213 0.04480528 0.2811814 ## NA19098.r1.A05 6134.590 0.03854402 0.3578605 ## NA19098.r1.A06 5810.730 0.03731131 0.3374924 plotColData( umi, x = "total_features_by_counts", y = "pct_counts_ERCC", colour = "hybrid_score" ) The scds paper features excellent descriptions and evaluations of other currently-available doublet detection methods. 6.3.2 DoubletDetection DoubletDetection is a python module that runs on raw UMI counts data. It generates artificial doublets and then perform cell clustering using the augmented dataset. Cells cluster closely to the artificial doublets across multiple iterations are predicted to be doublets. 
We provided the python scripts for running DoubletDetection on Tung datasets at ./mig_2019_scrnaseq-workshop/course_files/utils/run_doubletDetection.py python run_doubletDetection.py Here is the prediction results by DoubletDetection: require(UpSetR) ## Loading required package: UpSetR pred_tung <- read.delim(file = "data/doublets/tung.dbls.txt", header = FALSE) dim(pred_tung) ## [1] 864 1 dim(anno) ## [1] 864 5 umi$dbd_dbl <- factor(pred_tung$V1) qc_label <- read.delim(file = "data/qc_ipsc.txt") head(qc_label) ## individual replicate well cell_number concentration tra1.60 ## 1 NA19098 r1 A01 1 1.734785 1 ## 2 NA19098 r1 A02 1 1.723038 1 ## 3 NA19098 r1 A03 1 1.512786 1 ## 4 NA19098 r1 A04 1 1.347492 1 ## 5 NA19098 r1 A05 1 2.313047 1 ## 6 NA19098 r1 A06 1 2.056803 1 qc_label$sample_id <- paste0(qc_label$individual,".",qc_label$replicate,".",qc_label$well) rownames(qc_label) <- qc_label$sample_id umi$cell_number <- as.character(qc_label[umi$sample_id,"cell_number"]) umi$cell_number[qc_label$cell_number==0] <- "no_cell" umi$cell_number[qc_label$cell_number == 1] <- "single_cell" umi$cell_number[qc_label$cell_number>1] <- "multi_cell" multiplot(plotColData( umi, x = "total_features_by_counts", y = "pct_counts_ERCC", colour = "hybrid_score" ), plotColData( umi, x = "total_features_by_counts", y = "pct_counts_ERCC", colour = "dbd_dbl" ), plotColData( umi, x = "total_features_by_counts", y = "pct_counts_ERCC", colour = "cell_number" ),cols =2) doublets <- unique(umi$sample_id[umi$dbd_dbl =="1"], umi$sample_id[umi$hybrid_score > 0.8]) pl_list <- UpSetR::fromList(list(pred = doublets,qc_label = qc_label$sample_id[qc_label$cell_number >1])) UpSetR::upset(pl_list,sets = c("pred","qc_label")) 6.3.2.1 Other tools available: DoubletFinder DoubletCells as part of SimpleSingleCell Scrublet 6.4 Gene QC 6.4.1 Gene expression In addition to removing cells with poor quality, it is usually a good idea to exclude genes where we suspect that technical artefacts may have skewed the 
results. Moreover, inspection of the gene expression profiles may provide insights about how the experimental procedures could be improved. It is often instructive to consider the number of reads consumed by the top 50 expressed genes. plotHighestExprs(umi, exprs_values = "counts") Figure 6.6: Number of total counts consumed by the top 50 expressed genes The distributions are relatively flat indicating (but not guaranteeing!) good coverage of the full transcriptome of these cells. However, there are several spike-ins in the top 15 genes which suggests a greater dilution of the spike-ins may be preferrable if the experiment is to be repeated. 6.4.2 Gene filtering It is typically a good idea to remove genes whose expression level is considered “undetectableâ€. We define a gene as detectable if at least two cells contain more than 1 transcript from the gene. If we were considering read counts rather than UMI counts a reasonable threshold is to require at least five reads in at least two cells. However, in both cases the threshold strongly depends on the sequencing depth. It is important to keep in mind that genes must be filtered after cell filtering since some genes may only be detected in poor quality cells (note colData(umi)$use filter applied to the umi dataset). keep_feature <- nexprs( umi[,colData(umi)$use], byrow = TRUE, detection_limit = 1 ) >= 2 rowData(umi)$use <- keep_feature table(keep_feature) ## keep_feature ## FALSE TRUE ## 4660 14066 Depending on the cell-type, protocol and sequencing depth, other cut-offs may be appropriate. 
6.4.3 Save the data Dimensions of the QCed dataset (do not forget about the gene filter we defined above): dim(umi[rowData(umi)$use, colData(umi)$use]) ## [1] 14066 657 Let’s create an additional slot with log-transformed counts (we will need it in the next chapters) and remove saved PCA results from the reducedDim slot: assay(umi, "logcounts_raw") <- log2(counts(umi) + 1) reducedDim(umi) <- NULL Save the data: saveRDS(umi, file = "data/tung/umi.rds") 6.4.4 Big Exercise Perform exactly the same QC analysis with read counts of the same Blischak data. Use tung/reads.txt file to load the reads. Once you have finished please compare your results to ours (next chapter). 6.4.5 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] UpSetR_1.4.0 scds_1.0.0 ## [3] limma_3.40.6 scater_1.12.2 ## [5] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [7] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [9] BiocParallel_1.18.1 matrixStats_0.55.0 ## [11] Biobase_2.44.0 GenomicRanges_1.36.1 ## [13] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [15] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] ggbeeswarm_0.6.0 colorspace_1.4-1 ## [3] mvoutlier_2.0.9 class_7.3-15 ## [5] modeltools_0.2-22 rio_0.5.16 ## [7] mclust_5.4.5 XVector_0.24.0 ## [9] pls_2.7-1 BiocNeighbors_1.2.0 ## [11] cvTools_0.3.2 
flexmix_2.3-15 ## [13] mvtnorm_1.0-11 ranger_0.11.2 ## [15] splines_3.6.0 sROC_0.1-2 ## [17] robustbase_0.93-5 knitr_1.25 ## [19] zeallot_0.1.0 robCompositions_2.1.0 ## [21] kernlab_0.9-27 cluster_2.1.0 ## [23] rrcov_1.4-7 compiler_3.6.0 ## [25] backports_1.1.4 assertthat_0.2.1 ## [27] Matrix_1.2-17 lazyeval_0.2.2 ## [29] BiocSingular_1.0.0 htmltools_0.3.6 ## [31] tools_3.6.0 rsvd_1.0.2 ## [33] gtable_0.3.0 glue_1.3.1 ## [35] GenomeInfoDbData_1.2.1 dplyr_0.8.3 ## [37] Rcpp_1.0.2 carData_3.0-2 ## [39] cellranger_1.1.0 zCompositions_1.3.2-1 ## [41] vctrs_0.2.0 sgeostat_1.0-27 ## [43] fpc_2.2-3 DelayedMatrixStats_1.6.1 ## [45] lmtest_0.9-37 xfun_0.9 ## [47] laeken_0.5.0 stringr_1.4.0 ## [49] openxlsx_4.1.0.1 lifecycle_0.1.0 ## [51] irlba_2.3.3 DEoptimR_1.0-8 ## [53] zlibbioc_1.30.0 MASS_7.3-51.1 ## [55] zoo_1.8-6 scales_1.0.0 ## [57] VIM_4.8.0 hms_0.5.1 ## [59] RColorBrewer_1.1-2 yaml_2.2.0 ## [61] curl_4.2 NADA_1.6-1 ## [63] gridExtra_2.3 reshape_0.8.8 ## [65] stringi_1.4.3 highr_0.8 ## [67] pcaPP_1.9-73 e1071_1.7-2 ## [69] boot_1.3-20 zip_2.0.4 ## [71] truncnorm_1.0-8 rlang_0.4.0 ## [73] pkgconfig_2.0.3 prabclus_2.3-1 ## [75] bitops_1.0-6 evaluate_0.14 ## [77] lattice_0.20-38 purrr_0.3.2 ## [79] labeling_0.3 cowplot_1.0.0 ## [81] tidyselect_0.2.5 GGally_1.4.0 ## [83] plyr_1.8.4 magrittr_1.5 ## [85] bookdown_0.13 R6_2.4.0 ## [87] pillar_1.4.2 haven_2.1.1 ## [89] foreign_0.8-70 withr_2.1.2 ## [91] survival_2.43-3 abind_1.4-5 ## [93] RCurl_1.95-4.12 sp_1.3-1 ## [95] nnet_7.3-12 tibble_2.1.3 ## [97] crayon_1.3.4 car_3.0-3 ## [99] xgboost_0.90.0.2 rmarkdown_1.15 ## [101] viridis_0.5.1 grid_3.6.0 ## [103] readxl_1.3.1 data.table_1.12.2 ## [105] forcats_0.4.0 diptest_0.75-7 ## [107] vcd_1.4-4 digest_0.6.21 ## [109] tidyr_1.0.0 munsell_0.5.0 ## [111] beeswarm_0.2.3 viridisLite_0.3.0 ## [113] vipor_0.4.5 6.5 Exercise: Expression QC (Reads) library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) reads <- read.table("data/tung/reads.txt", sep = "\\t") 
anno <- read.table("data/tung/annotation.txt", sep = "\\t", header = TRUE) head(reads[ , 1:3]) ## NA19098.r1.A01 NA19098.r1.A02 NA19098.r1.A03 ## ENSG00000237683 0 0 0 ## ENSG00000187634 0 0 0 ## ENSG00000188976 57 140 1 ## ENSG00000187961 0 0 0 ## ENSG00000187583 0 0 0 ## ENSG00000187642 0 0 0 head(anno) ## individual replicate well batch sample_id ## 1 NA19098 r1 A01 NA19098.r1 NA19098.r1.A01 ## 2 NA19098 r1 A02 NA19098.r1 NA19098.r1.A02 ## 3 NA19098 r1 A03 NA19098.r1 NA19098.r1.A03 ## 4 NA19098 r1 A04 NA19098.r1 NA19098.r1.A04 ## 5 NA19098 r1 A05 NA19098.r1 NA19098.r1.A05 ## 6 NA19098 r1 A06 NA19098.r1 NA19098.r1.A06 reads <- SingleCellExperiment( assays = list(counts = as.matrix(reads)), colData = anno ) keep_feature <- rowSums(counts(reads) > 0) > 0 reads <- reads[keep_feature, ] isSpike(reads, "ERCC") <- grepl("^ERCC-", rownames(reads)) isSpike(reads, "MT") <- rownames(reads) %in% c("ENSG00000198899", "ENSG00000198727", "ENSG00000198888", "ENSG00000198886", "ENSG00000212907", "ENSG00000198786", "ENSG00000198695", "ENSG00000198712", "ENSG00000198804", "ENSG00000198763", "ENSG00000228253", "ENSG00000198938", "ENSG00000198840") reads <- calculateQCMetrics( reads, feature_controls = list( ERCC = isSpike(reads, "ERCC"), MT = isSpike(reads, "MT") ) ) ## Warning in calculateQCMetrics(reads, feature_controls = list(ERCC = ## isSpike(reads, : spike-in set 'ERCC' overwritten by feature_controls set of ## the same name hist( reads$total_counts, breaks = 100 ) abline(v = 1.3e6, col = "red") Figure 6.7: Histogram of library sizes for all cells filter_by_total_counts <- (reads$total_counts > 1.3e6) table(filter_by_total_counts) ## filter_by_total_counts ## FALSE TRUE ## 180 684 hist( reads$total_features_by_counts, breaks = 100 ) abline(v = 7000, col = "red") Figure 6.8: Histogram of the number of detected genes in all cells filter_by_expr_features <- (reads$total_features_by_counts > 7000) table(filter_by_expr_features) ## filter_by_expr_features ## FALSE TRUE ## 116 748 
plotColData( reads, x = "total_features_by_counts", y = "pct_counts_MT", colour = "batch" ) Figure 6.9: Percentage of counts in MT genes plotColData( reads, x = "total_features_by_counts", y = "pct_counts_ERCC", colour = "batch" ) Figure 6.10: Percentage of counts in ERCCs filter_by_ERCC <- reads$batch != "NA19098.r2" & reads$pct_counts_ERCC < 25 table(filter_by_ERCC) ## filter_by_ERCC ## FALSE TRUE ## 103 761 filter_by_MT <- reads$pct_counts_MT < 30 table(filter_by_MT) ## filter_by_MT ## FALSE TRUE ## 18 846 reads$use <- ( # sufficient features (genes) filter_by_expr_features & # sufficient molecules counted filter_by_total_counts & # sufficient endogenous RNA filter_by_ERCC & # remove cells with unusual number of reads in MT genes filter_by_MT ) table(reads$use) ## ## FALSE TRUE ## 258 606 reads <- runPCA( reads, use_coldata = TRUE, detect_outliers = TRUE ) reducedDimNames(reads) ## [1] "PCA_coldata" table(reads$outlier) ## ## FALSE TRUE ## 753 111 plotReducedDim( reads, use_dimred = "PCA_coldata", size_by = "total_features_by_counts", shape_by = "use", colour_by = "outlier" ) library(limma) ## ## Attaching package: 'limma' ## The following object is masked from 'package:scater': ## ## plotMDS ## The following object is masked from 'package:BiocGenerics': ## ## plotMA auto <- colnames(reads)[reads$outlier] man <- colnames(reads)[!reads$use] venn.diag <- vennCounts( cbind(colnames(reads) %in% auto, colnames(reads) %in% man) ) vennDiagram( venn.diag, names = c("Automatic", "Manual"), circle.col = c("blue", "green") ) Figure 6.11: Comparison of the default, automatic and manual cell filters plotHighestExprs(reads, exprs_values = "counts") Figure 6.12: Number of total counts consumed by the top 50 expressed genes keep_feature <- nexprs( reads[,colData(reads)$use], byrow = TRUE, detection_limit = 1 ) >= 2 rowData(reads)$use <- keep_feature table(keep_feature) ## keep_feature ## FALSE TRUE ## 2664 16062 dim(reads[rowData(reads)$use, colData(reads)$use]) ## [1] 16062 
606 assay(reads, "logcounts_raw") <- log2(counts(reads) + 1) reducedDim(reads) <- NULL saveRDS(reads, file = "data/tung/reads.rds") By comparing Figure 6.5 and Figure 6.11, it is clear that the reads based filtering removed more cells than the UMI based analysis. If you go back and compare the results you should be able to conclude that the ERCC and MT filters are more strict for the reads-based analysis. sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] limma_3.40.6 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] ggbeeswarm_0.6.0 colorspace_1.4-1 ## [3] mvoutlier_2.0.9 class_7.3-15 ## [5] modeltools_0.2-22 rio_0.5.16 ## [7] mclust_5.4.5 XVector_0.24.0 ## [9] pls_2.7-1 BiocNeighbors_1.2.0 ## [11] cvTools_0.3.2 flexmix_2.3-15 ## [13] mvtnorm_1.0-11 ranger_0.11.2 ## [15] splines_3.6.0 sROC_0.1-2 ## [17] robustbase_0.93-5 knitr_1.25 ## [19] zeallot_0.1.0 robCompositions_2.1.0 ## [21] kernlab_0.9-27 cluster_2.1.0 ## [23] rrcov_1.4-7 compiler_3.6.0 ## [25] backports_1.1.4 assertthat_0.2.1 ## [27] Matrix_1.2-17 
lazyeval_0.2.2 ## [29] BiocSingular_1.0.0 htmltools_0.3.6 ## [31] tools_3.6.0 rsvd_1.0.2 ## [33] gtable_0.3.0 glue_1.3.1 ## [35] GenomeInfoDbData_1.2.1 dplyr_0.8.3 ## [37] Rcpp_1.0.2 carData_3.0-2 ## [39] cellranger_1.1.0 zCompositions_1.3.2-1 ## [41] vctrs_0.2.0 sgeostat_1.0-27 ## [43] fpc_2.2-3 DelayedMatrixStats_1.6.1 ## [45] lmtest_0.9-37 xfun_0.9 ## [47] laeken_0.5.0 stringr_1.4.0 ## [49] openxlsx_4.1.0.1 lifecycle_0.1.0 ## [51] irlba_2.3.3 DEoptimR_1.0-8 ## [53] zlibbioc_1.30.0 MASS_7.3-51.1 ## [55] zoo_1.8-6 scales_1.0.0 ## [57] VIM_4.8.0 hms_0.5.1 ## [59] RColorBrewer_1.1-2 yaml_2.2.0 ## [61] curl_4.2 NADA_1.6-1 ## [63] gridExtra_2.3 reshape_0.8.8 ## [65] stringi_1.4.3 highr_0.8 ## [67] pcaPP_1.9-73 e1071_1.7-2 ## [69] boot_1.3-20 zip_2.0.4 ## [71] truncnorm_1.0-8 rlang_0.4.0 ## [73] pkgconfig_2.0.3 prabclus_2.3-1 ## [75] bitops_1.0-6 evaluate_0.14 ## [77] lattice_0.20-38 purrr_0.3.2 ## [79] labeling_0.3 cowplot_1.0.0 ## [81] tidyselect_0.2.5 GGally_1.4.0 ## [83] plyr_1.8.4 magrittr_1.5 ## [85] bookdown_0.13 R6_2.4.0 ## [87] pillar_1.4.2 haven_2.1.1 ## [89] foreign_0.8-70 withr_2.1.2 ## [91] survival_2.43-3 abind_1.4-5 ## [93] RCurl_1.95-4.12 sp_1.3-1 ## [95] nnet_7.3-12 tibble_2.1.3 ## [97] crayon_1.3.4 car_3.0-3 ## [99] rmarkdown_1.15 viridis_0.5.1 ## [101] grid_3.6.0 readxl_1.3.1 ## [103] data.table_1.12.2 forcats_0.4.0 ## [105] diptest_0.75-7 vcd_1.4-4 ## [107] digest_0.6.21 tidyr_1.0.0 ## [109] munsell_0.5.0 beeswarm_0.2.3 ## [111] viridisLite_0.3.0 vipor_0.4.5 6.6 Data visualization and exploratory data analysis 6.6.1 Introduction In this chapter we will continue to work with the filtered Tung dataset produced in the previous chapter. We will explore different ways of visualizing the data to allow you to asses what happened to the expression matrix after the quality control step. scater package provides several very useful functions to simplify visualisation. One important aspect of single-cell RNA-seq is to control for batch effects. 
Batch effects are technical artefacts that are added to the samples during handling. For example, if two sets of samples were prepared in different labs or even on different days in the same lab, then we may observe greater similarities between the samples that were handled together. In the worst case scenario, batch effects may be mistaken for true biological variation. Data visualisation can help to identify batch effects or other unwanted sources of variation that affect our observed gene expression measurements. The Tung data allows us to explore these issues in a controlled manner since some of the salient aspects of how the samples were handled have been recorded. Ideally, we expect to see batches from the same individual grouping together and distinct groups corresponding to each individual. Data visualisation and exploratory data analysis are invaluable for allowing us to get a “feel†for a dataset. This is an area of data analysis that is perhaps more art than science, but is a crucial aspect of single-cell QC and analysis. library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) umi <- readRDS("data/tung/umi.rds") umi.qc <- umi[rowData(umi)$use, colData(umi)$use] endog_genes <- !rowData(umi.qc)$is_feature_control 6.6.2 PCA plot The easiest way to overview the data is by transforming it using the principal component analysis and then visualize the first two principal components. Principal component analysis (PCA) is a statistical procedure that uses a transformation to convert a set of observations into a set of values of linearly uncorrelated variables called principal components (PCs). The number of principal components is less than or equal to the number of original variables. Mathematically, the PCs correspond to the eigenvectors of the covariance matrix. 
The eigenvectors are sorted by eigenvalue so that the first principal component accounts for as much of the variability in the data as possible, and each succeeding component in turn has the highest variance possible under the constraint that it is orthogonal to the preceding components (the figure below is taken from here). Figure 6.13: Schematic representation of PCA dimensionality reduction 6.6.2.1 Before QC Without log-transformation: tmp <- runPCA( umi[endog_genes, ], exprs_values = "counts" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.14: PCA plot of the tung data With log-transformation: tmp <- runPCA( umi[endog_genes, ], exprs_values = "logcounts_raw" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.15: PCA plot of the tung data Clearly log-transformation is benefitial for our data - it reduces the variance on the first principal component and already separates some biological effects. Moreover, it makes the distribution of the expression values more normal. In the following analysis and chapters we will be using log-transformed raw counts by default. However, note that just a log-transformation is not enough to account for different technical factors between the cells (e.g. sequencing depth). Therefore, please do not use logcounts_raw for your downstream analysis, instead as a minimum suitable data use the logcounts slot of the SingleCellExperiment object, which not just log-transformed, but also normalised by library size (e.g. CPM normalisation). In the course we use logcounts_raw only for demonstration purposes! 
6.6.2.2 After QC tmp <- runPCA( umi.qc[endog_genes, ], exprs_values = "logcounts_raw" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.16: PCA plot of the tung data Comparing Figure 6.15 and Figure 6.16, it is clear that after quality control the NA19098.r2 cells no longer form a group of outliers. By default only the top 500 most variable genes are used by scater to calculate the PCA. This can be adjusted by changing the ntop argument. Exercise 1 How do the PCA plots change if when all 14,066 genes are used? Or when only top 50 genes are used? Why does the fraction of variance accounted for by the first PC change so dramatically? Hint Use ntop argument of the plotPCA function. Our answer Figure 6.17: PCA plot of the tung data (14214 genes) Figure 6.18: PCA plot of the tung data (50 genes) If your answers are different please compare your code with ours (you need to search for this exercise in the opened file). 6.6.3 tSNE map An alternative to PCA for visualizing scRNA-seq data is a tSNE plot. tSNE (t-Distributed Stochastic Neighbor Embedding) converts high-dimensional Euclidean distances between datapoints into conditional probabilities that represent similarities, to produce a low-dimensional representation of high-dimensional data that displays large- and local-scale structure in the dataset. Here, we map high dimensional data ( i.e. our 14,214 dimensional expression matrix) to a 2-dimensional space while preserving local distances between cells. tSNE is almost always used to produce a two-dimensional representation of a high-dimensional dataset; it is only rarely used to generate a reduced-dimension space with more than two dimensions and is typically used only for visulisation as opposed being used as a general dimension-reduction method. 
Due to the non-linear and stochastic nature of the algorithm, tSNE is more difficult to intuitively interpret than a standard dimensionality reduction method such as PCA. Things to be aware of when using tSNE: tSNE has a tendency to (visually) cluster points; as such, it often creates attractive plots of datasets with distinct cell types, but does look as good when there are continuous changes in the cell population. The hyperparameters really matter: in particular, changing the perplexity parameter can have a large effect on the visulisation produced. Perplexity is a measure of information, but can loosely be thought of as a tuning parameter that controls the number of nearest neighbous for each datapoint. Cluster sizes in a tSNE plot mean nothing. Distances between clusters might not mean anything. Random noise doesn’t always look random. You can see some shapes, sometimes. For more details about how to use tSNE effectively, see this exellent article. In contrast with PCA, tSNE is a stochastic algorithm which means running the method multiple times on the same dataset will result in different plots. To ensure reproducibility, we fix the “seed†of the random-number generator in the code below so that we always get the same plot. 6.6.3.1 Before QC set.seed(123456) tmp <- runTSNE( umi[endog_genes, ], exprs_values = "logcounts_raw", perplexity = 130 ) plotTSNE( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.19: tSNE map of the tung data 6.6.3.2 After QC set.seed(123456) tmp <- runTSNE( umi.qc[endog_genes, ], exprs_values = "logcounts_raw", perplexity = 130 ) plotTSNE( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.20: tSNE map of the tung data Interpreting PCA and tSNE plots is often challenging and due to their stochastic and non-linear nature, they are less intuitive. However, in this case it is clear that they provide a similar picture of the data. 
Comparing Figure 6.19 and 6.20, it is again clear that the samples from NA19098.r2 are no longer outliers after the QC filtering. Furthermore tSNE requires you to provide a value of perplexity which reflects the number of neighbours used to build the nearest-neighbour network; a high value creates a dense network which clumps cells together while a low value makes the network more sparse allowing groups of cells to separate from each other. scater uses a default perplexity of the total number of cells divided by five (rounded down). You can read more about the pitfalls of using tSNE here. UMAP (Uniform Manifold Approximation and Projection) is a newer alternative to tSNE which also often creates attractive visualisations of scRNA-seq data with the benefit of being faster than tSNE to compute and is a “true†dimensionality reduction method. We will look at PCA, tSNE and UMAP plots in subsequent chapters and discuss the topic of dimensionality reduction further in the Latent spaces chapter. Exercise 2 How do the tSNE plots change when a perplexity of 10 or 200 is used? How does the choice of perplexity affect the interpretation of the results? Our answer Figure 6.21: tSNE map of the tung data (perplexity = 10) Figure 6.22: tSNE map of the tung data (perplexity = 200) 6.6.4 Big Exercise Perform the same analysis with read counts of the Blischak data. Use tung/reads.rds file to load the reads SCE object. Once you have finished please compare your results to ours (next chapter). 
6.6.5 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scater_1.12.2 ggplot2_3.2.1 ## [3] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [5] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [7] matrixStats_0.55.0 Biobase_2.44.0 ## [9] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 ## [11] IRanges_2.18.3 S4Vectors_0.22.1 ## [13] BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] Rcpp_1.0.2 rsvd_1.0.2 ## [3] lattice_0.20-38 assertthat_0.2.1 ## [5] digest_0.6.21 R6_2.4.0 ## [7] evaluate_0.14 highr_0.8 ## [9] pillar_1.4.2 zlibbioc_1.30.0 ## [11] rlang_0.4.0 lazyeval_0.2.2 ## [13] irlba_2.3.3 Matrix_1.2-17 ## [15] rmarkdown_1.15 BiocNeighbors_1.2.0 ## [17] labeling_0.3 Rtsne_0.15 ## [19] stringr_1.4.0 RCurl_1.95-4.12 ## [21] munsell_0.5.0 compiler_3.6.0 ## [23] vipor_0.4.5 BiocSingular_1.0.0 ## [25] xfun_0.9 pkgconfig_2.0.3 ## [27] ggbeeswarm_0.6.0 htmltools_0.3.6 ## [29] tidyselect_0.2.5 tibble_2.1.3 ## [31] gridExtra_2.3 GenomeInfoDbData_1.2.1 ## [33] bookdown_0.13 viridisLite_0.3.0 ## [35] crayon_1.3.4 dplyr_0.8.3 ## [37] withr_2.1.2 bitops_1.0-6 ## [39] grid_3.6.0 gtable_0.3.0 ## [41] magrittr_1.5 scales_1.0.0 ## [43] stringi_1.4.3 XVector_0.24.0 ## [45] viridis_0.5.1 DelayedMatrixStats_1.6.1 ## [47] cowplot_1.0.0 tools_3.6.0 ## [49] glue_1.3.1 beeswarm_0.2.3 ## [51] purrr_0.3.2 yaml_2.2.0 
## [53] colorspace_1.4-1 knitr_1.25 6.7 Exercise: Data visualization (Reads) library(scater) options(stringsAsFactors = FALSE) reads <- readRDS("data/tung/reads.rds") reads.qc <- reads[rowData(reads)$use, colData(reads)$use] endog_genes <- !rowData(reads.qc)$is_feature_control tmp <- runPCA( reads[endog_genes, ], exprs_values = "counts" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.23: PCA plot of the tung data tmp <- runPCA( reads[endog_genes, ], exprs_values = "logcounts_raw" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.24: PCA plot of the tung data tmp <- runPCA( reads.qc[endog_genes, ], exprs_values = "logcounts_raw" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.25: PCA plot of the tung data set.seed(123456) tmp <- runTSNE( reads[endog_genes, ], exprs_values = "logcounts_raw", perplexity = 130 ) plotTSNE( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.26: tSNE map of the tung data set.seed(123456) tmp <- runTSNE( reads.qc[endog_genes, ], exprs_values = "logcounts_raw", perplexity = 130 ) plotTSNE( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.27: tSNE map of the tung data Figure 6.21: tSNE map of the tung data (perplexity = 10) Figure 6.22: tSNE map of the tung data (perplexity = 200) sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C 
LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scater_1.12.2 ggplot2_3.2.1 ## [3] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [5] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [7] matrixStats_0.55.0 Biobase_2.44.0 ## [9] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 ## [11] IRanges_2.18.3 S4Vectors_0.22.1 ## [13] BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] Rcpp_1.0.2 rsvd_1.0.2 ## [3] lattice_0.20-38 assertthat_0.2.1 ## [5] digest_0.6.21 R6_2.4.0 ## [7] evaluate_0.14 highr_0.8 ## [9] pillar_1.4.2 zlibbioc_1.30.0 ## [11] rlang_0.4.0 lazyeval_0.2.2 ## [13] irlba_2.3.3 Matrix_1.2-17 ## [15] rmarkdown_1.15 BiocNeighbors_1.2.0 ## [17] labeling_0.3 Rtsne_0.15 ## [19] stringr_1.4.0 RCurl_1.95-4.12 ## [21] munsell_0.5.0 compiler_3.6.0 ## [23] vipor_0.4.5 BiocSingular_1.0.0 ## [25] xfun_0.9 pkgconfig_2.0.3 ## [27] ggbeeswarm_0.6.0 htmltools_0.3.6 ## [29] tidyselect_0.2.5 tibble_2.1.3 ## [31] gridExtra_2.3 GenomeInfoDbData_1.2.1 ## [33] bookdown_0.13 viridisLite_0.3.0 ## [35] crayon_1.3.4 dplyr_0.8.3 ## [37] withr_2.1.2 bitops_1.0-6 ## [39] grid_3.6.0 gtable_0.3.0 ## [41] magrittr_1.5 scales_1.0.0 ## [43] stringi_1.4.3 XVector_0.24.0 ## [45] viridis_0.5.1 DelayedMatrixStats_1.6.1 ## [47] cowplot_1.0.0 tools_3.6.0 ## [49] glue_1.3.1 beeswarm_0.2.3 ## [51] purrr_0.3.2 yaml_2.2.0 ## [53] colorspace_1.4-1 knitr_1.25 References "], -["normalization-confounders-and-batch-correction.html", "7 Normalization, confounders and batch correction 7.1 Normalization theory 7.2 Normalization practice (UMI) 7.3 Normalization practice (Reads) 7.4 Identifying confounding factors 7.5 Identifying confounding factors (Reads) 7.6 Dealing with confounders 7.7 Dealing with confounders (Reads) 7.8 Feature Selection", " 7 Normalization, confounders and batch correction 7.1 Normalization theory 
7.1.1 Introduction In this chapter, we will explore approaches to normalization, confounder identification and batch correction for scRNA-seq data. Even in the absence of specific confounding factors, thoughtful normalization of scRNA-seq data is required. The raw count values are not directly comparable between cells, because in general the sequencing depth (number of reads obtained; often called library size) is very different across cells—orders-of-magnitude differences in sequencing depth are commonly observed between cells in an scRNA-seq dataset. If ignored, or not handled correctly, library size differences can be the dominant source of variation between single-cell gene expression profiles, obscuring the biological signal of interest. n Related to library size, differences in library composition can also cause problems when we are trying to compare expression profiles between cells. Normalization can and should also account for differences in library composition. In addition to normalization, it is also useful to identify confounding factors so that they can be accounted for in downstream analyses. In many cases, “accounting†for confounding variables may involve incorporating them as variables in a particular statistical model (e.g. in a differential expression model). In other cases, it may be desirable to “regress out†(either in a literal or figurative sense) confounding factors—the challenge for scRNA-seq data is finding the right model and/or data transformation such that regressing out confounding factors would work as desired. We discuss this further below. The issue of batch effects is just as important for scRNA-seq data as it is in other areas of genomics. Briefly, scRNA-seq and other ’omics assays are sensitive to minor differences in technical features of data generation. As such, even when assaying the same experimental or biological system, measurements taken at difference times and places or by different people will differ substantially. 
To make valid comparisons between cells, samples or groups, we first need to design our studies to be robust to batch effects and then we need to treat batch effects appropriately in our analyses. In the following sections, we will explore simple size-factor normalizations correcting for library size and composition and also discuss a more recent, conceptually quite different, approach to tackling the problem of library size differences between cells. 7.1.2 Library size Library sizes vary because scRNA-seq data is often sequenced on highly multiplexed platforms, so the total reads derived from each cell may differ substantially. Most scRNA-seq platforms and/or quantification methods currently available produce count values as the “raw”, “observed”, gene expression values. For such count data, the library size must be corrected for as part of data normalization. One popular strategy, borrowed and extended from the analysis of bulk RNA-seq data, is to multiply or divide each column of the expression matrix (in our setup columns correspond to cells) by a “normalization factor” which is an estimate of the library size relative to the other cells. Many methods to correct for library size have been developed for bulk RNA-seq and can be equally applied to scRNA-seq (eg. UQ, SF, CPM, RPKM, FPKM, TPM). In addition, single-cell specific size-factor normalization methods have been proposed to better handle the characteristics of scRNA-seq data (namely greater sparsity/proportion of zero counts). We will demonstrate use of the size-factor normalization method from the scran package in this chapter. A conceptually different approach to normalization of scRNA-seq data was proposed earlier in 2019 by (Hafemeister and Satija 2019). The idea behind the sctransform approach is to fit a regularized negative binomial model to the raw count data, with library size as the only explanatory variable in the model. 
The residuals from this model can then be used as normalized and variance-stabilized expression values. We show the use of this method too in this chapter. Some quantification methods (particularly those that quantify transcript-level expression, e.g. Salmon, kallisto) return transcripts-per-million values, TPM (instead of or in addition to count values), which effectively incorporate library size when determining gene expression estimates and thus do not require subsequent normalization for library size. However, TPM values may still be susceptible to library composition biases and so normalization may still be required. 7.1.3 Scaling or size-factor normalization methods The normalization methods discussed in this section all involve dividing the counts for each cell by a constant value to account for library size and, in some cases, library composition. These methods will typically give (adjusted/normalized) counts-per-million (CPM) or transcripts-per-million (TPM) values. Ideally, after applying one of these scaling/size-factor normalization methods, the CPM/TPM values produced are comparable across cells, with the effects of sequencing depth removed. However, even if this is true (i.e. the normalization has worked well), the CPM/TPM values do not have stable variance. Specifically, as the size of the values increases, so does the variance. This feature of the data (heteroskedasticity, or asymmetric, heavy-tailed distributions) is problematic for statistical analysis methods that assume homoskedasticity, that is that there is no relationship between the mean of expression values and their variance (i.e. just about anything that uses a Gaussian error model). As such, we should apply a variance stabilizing transformation to these data so that we can use standard statistical methods like linear regression and PCA with confidence. 
Developing a thoroughly effective variance stabilizing transformation is a challenge, so almost universally a log transformation (typically log2) is applied to the CPM/TPM values (the logcounts slot in a SingleCellExperiment object is expected to contain (normalized) log2-scale CPM/TPM values). For high-depth cells and highly-expressed genes this transformation generally works well (as for bulk RNA-seq data), but, as we will discuss below, it often performs sub-optimally for (sparse) scRNA-seq data. 7.1.3.1 CPM The simplest way to normalize this data is to convert it to counts per million (CPM) by dividing each column by its total then multiplying by 1,000,000. Note that spike-ins should be excluded from the calculation of total expression in order to correct for total cell RNA content, therefore we will only use endogenous genes. Example of a CPM function in R (using the scater package): calc_cpm <- function (expr_mat, spikes = NULL) { norm_factor <- colSums(expr_mat[-spikes, ]) return(t(t(expr_mat)/norm_factor) * 10^6) } One potential drawback of CPM is if your sample contains genes that are both very highly expressed and differentially expressed across the cells. In this case, the total molecules in the cell may depend on whether such genes are on/off in the cell and normalizing by total molecules may hide the differential expression of those genes and/or falsely create differential expression for the remaining genes. Note RPKM, FPKM and TPM are variants on CPM which further adjust counts by the length of the respective gene/transcript. TPM is usually a direct output of a transcript expression quantification method (e.g. Salmon, kallisto, etc). To deal with this potentiality several other measures were devised. 7.1.3.2 RLE (SF) The size factor (SF) was proposed and popularized by DESeq (Anders and Huber 2010). First the geometric mean of each gene across all cells is calculated. 
The size factor for each cell is the median across genes of the ratio of the expression to the gene’s geometric mean. A drawback to this method is that since it uses the geometric mean only genes with non-zero expression values across all cells can be used in its calculation, making it unadvisable for large low-depth scRNASeq experiments. edgeR & scater call this method RLE for “relative log expression” (to distinguish it from the many other size-factor normalization methods that now exist). Example of a SF function in R (from the edgeR package): calc_sf <- function (expr_mat, spikes = NULL) { geomeans <- exp(rowMeans(log(expr_mat[-spikes, ]))) SF <- function(cnts) { median((cnts/geomeans)[(is.finite(geomeans) & geomeans > 0)]) } norm_factor <- apply(expr_mat[-spikes, ], 2, SF) return(t(t(expr_mat)/norm_factor)) } 7.1.3.3 UQ The upperquartile (UQ) was proposed by (Bullard et al. 2010). Here each column is divided by the 75% quantile of the counts for each library. Often the calculated quantile is scaled by the median across cells to keep the absolute level of expression relatively consistent. A drawback to this method is that for low-depth scRNASeq experiments the large number of undetected genes may result in the 75% quantile being zero (or close to it). This limitation can be overcome by generalizing the idea and using a higher quantile (eg. the 99% quantile is the default in scater) or by excluding zeros prior to calculating the 75% quantile. Example of a UQ function in R (again from the edgeR package): calc_uq <- function (expr_mat, spikes = NULL) { UQ <- function(x) { quantile(x[x > 0], 0.75) } uq <- unlist(apply(expr_mat[-spikes, ], 2, UQ)) norm_factor <- uq/median(uq) return(t(t(expr_mat)/norm_factor)) } 7.1.3.4 TMM Another method, called TMM, is the weighted trimmed mean of M-values (to the reference) proposed by (Robinson and Oshlack 2010). The M-values in question are the gene-wise log2-fold changes between individual cells. 
One cell is used as the reference, then the M-values for each other cell are calculated compared to this reference. These values are then trimmed by removing the top and bottom ~30%, and the average of the remaining values is calculated by weighting them to account for the effect of the log scale on variance. Each non-reference cell is multiplied by the calculated factor. Two potential issues with this method are insufficient non-zero genes left after trimming, and the assumption that most genes are not differentially expressed. sizeFactors(umi.qc) <- edgeR::calcNormFactors(counts(umi.qc), method = "TMM") 7.1.3.5 scran The scran package implements a variant on CPM size-factor normalization specialized for single-cell data (L. Lun, Bach, and Marioni 2016). Briefly this method deals with the problem of very large numbers of zero values per cell by pooling cells together and calculating a normalization factor (similar to TMM) for the sum of each pool. Since each cell is found in many different pools, cell-specific factors can be deconvoluted from the collection of pool-specific factors using linear algebra. This method applies a “quick cluster” method to get rough clusters of cells to pool together to apply the strategy outlined above. qclust <- quickCluster(umi.qc, min.size = 30) umi.qc <- computeSumFactors(umi.qc, sizes = 15, clusters = qclust) 7.1.4 sctransform The sctransform method is very different from the scaling/size-factor methods discussed above. In their paper, (Hafemeister and Satija 2019) argue that the log-transformation of (normalized) CPM values does not stabilise the variance of expression values, particularly in the case of sparse(r) UMI-count data. Figure 1 of their paper (reproduced below) sets out this argument that strong relationships exist between gene expression and total cell UMI count, even after applying a scaled log-normalization method. Figure 2.4: Reproduction of Figure 1 from Hafemeister and Satija (2019). 33,148 PBMC dataset from 10x genomics. 
A) Distribution of total UMI counts / cell (‘sequencing depth’). B) We placed genes into six groups, based on their average expression in the dataset. C) For each gene group, we examined the average relationship between observed counts and cell sequencing depth. We fit a smooth line for each gene individually and combined results based on the groupings in (B). Black line shows mean, colored region indicates interquartile range. D) Same as in (C), but showing scaled log-normalized values instead of UMI counts. Values were scaled (z-scored) so that a single y-axis range could be used. E) Relationship between gene variance and cell sequencing depth; Cells were placed into five equal-sized groups based on total UMI counts (group 1 has the greatest depth), and we calculated the total variance of each gene group within each bin. For effectively normalized data, each cell bin should contribute 20% to the variance of each gene group. One effect of the failure of the scaled log-normalization to remove the relationship between total cell UMI count and expression is that dimension-reduction methods (especially PCA) applied to the log-normalized data can return reduced dimension spaces where, very often, the first dimension is highly correlated with total cell UMI count or total cell genes expressed. This effect is noted and discussed by (William Townes et al. 2019). The sctransform solution is to fit a negative binomial (NB) generalized linear model to the UMI counts for each gene, with an intercept term and a coefficient for library size (specifically using log10(total cell UMI count) as a covariate) as parameters in the model. The negative binomial model can account for much more variance in the observed count data than a simpler model like the Poisson can. To avoid overfitting the model to the data, the gene-wise intercept, library size and overdispersion parameters are regularized by fitting a loess (locally-linear smoothing method) to the per-gene estimates from the GLM. 
Figure 2.5: Reproduction of Figure 2A from Hafemeister and Satija (2019). They fit NB regression models for each gene individually, and bootstrapped the process to measure uncertainty in the resulting parameter estimates. A) Model parameters for 16,809 genes for the NB regression model, plotted as a function of average gene abundance. The color of each point indicates a parameter uncertainty score as determined by bootstrapping (Methods). Pink line shows the regularized parameters obtained via kernel regression. The regularized NB GLM is presented as an attractive middle ground between the (underfit) Poisson model and the (overfit) unregularized NB model. The Pearson residuals from the regularized NB GLM are used as “normalized” expression values for downstream analyses. Figure 2.6: Reproduction of Figure 4 from Hafemeister and Satija (2019). A) For four genes, we show the relationship between cell sequencing depth and molecular counts. White points show the observed data. Background color represents the Pearson residual magnitude under three error models. For MALAT1 (does not vary across cell types) the Poisson error model does not account for overdispersion, and incorrectly infers significant residual variation (biological heterogeneity). For S100A9 (a CD14+ Monocyte marker) and CD74 (expressed in antigen-presenting cells) the non-regularized NB model overfits the data, and collapses biological heterogeneity. For PPBP (a Megakaryocyte marker) both non-regularized models wrongly fit a negative slope. B) Boxplot of Pearson residuals for models shown in A. X-axis range shown is limited to [-8, 25] for visual clarity. The regularized NB GLM also provides a natural way to do feature selection (i.e. find informative genes) using the deviance of the fitted GLM for each gene. We discuss this further in the Feature Selection section. We find the Pearson residuals from sctransform to be highly suitable as input to visualisation (dimension reduction) and clustering methods. 
For several other analyses (e.g. differential expression analyses), where statistical models designed for sparse count data are available, we prefer to use approaches that work with the “raw†count data. We are not yet sure how well sctransform performs on full-length transcript (i.e. non-UMI) count data. 7.1.5 Downsampling A final way to correct for library size is to downsample the expression matrix so that each cell has approximately the same total number of molecules. The benefit of this method is that zero values will be introduced by the down sampling thus eliminating any biases due to differing numbers of detected genes. However, the major drawback is that the process is not deterministic so each time the downsampling is run the resulting expression matrix is slightly different. Thus, often analyses must be run on multiple downsamplings to ensure results are robust. Downsampling to the depth of the cell with the lowest sequencing depth (that still passes QC) will typically discard much (most) of the information gathered in a (typically expensive) scRNA-seq experiment. We view this as a heavy price to pay for a normalization method that generally does not seem to outperform alternatives. Thus, we would not recommend downsampling as a normalization strategy for scRNA-seq data unless all alternatives have failed. 7.1.6 Effectiveness To compare the efficiency of different normalization methods we will use visual inspection of PCA plots and calculation of cell-wise relative log expression via scater’s plotRLE() function. Namely, cells with many (few) reads have higher (lower) than median expression for most genes resulting in a positive (negative) RLE across the cell, whereas normalized cells have an RLE close to zero. 
Example of a RLE function in R: calc_cell_RLE <- function (expr_mat, spikes = NULL) { RLE_gene <- function(x) { if (median(unlist(x)) > 0) { log((x + 1)/(median(unlist(x)) + 1))/log(2) } else { rep(NA, times = length(x)) } } if (!is.null(spikes)) { RLE_matrix <- t(apply(expr_mat[-spikes, ], 1, RLE_gene)) } else { RLE_matrix <- t(apply(expr_mat, 1, RLE_gene)) } cell_RLE <- apply(RLE_matrix, 2, median, na.rm = T) return(cell_RLE) } Note The RLE, TMM, and UQ size-factor methods were developed for bulk RNA-seq data and, depending on the experimental context, may not be appropriate for single-cell RNA-seq data, as their underlying assumptions may be problematically violated. Note The calcNormFactors function from the edgeR package implements several library size normalization methods making it easy to apply any of these methods to our data. Note edgeR makes extra adjustments to some of the normalization methods which may result in somewhat different results than if the original methods are followed exactly, e.g. edgeR’s and scater’s “RLE” method which is based on the “size factor” used by DESeq may give different results to the estimateSizeFactorsForMatrix method in the DESeq/DESeq2 packages. In addition, some (earlier) versions of edgeR will not calculate the normalization factors correctly unless lib.size is set at 1 for all cells. Note For CPM normalisation we use scater’s calculateCPM() function. For RLE, UQ and TMM we used to use scater’s normaliseExprs() function, but it was deprecated and has been removed from the package. For scran we use the scran package to calculate size factors (it also operates on SingleCellExperiment class) and scater’s normalize() to normalise the data. All these normalization functions save the results to the logcounts slot of the SCE object. 7.2 Normalization practice (UMI) We will continue to work with the tung data that was used in the previous chapter. 
library(scRNA.seq.funcs) library(scater) library(scran) options(stringsAsFactors = FALSE) set.seed(1234567) umi <- readRDS("data/tung/umi.rds") umi.qc <- umi[rowData(umi)$use, colData(umi)$use] endog_genes <- !rowData(umi.qc)$is_feature_control 7.2.1 Raw tmp <- runPCA( umi.qc[endog_genes, ], exprs_values = "logcounts_raw" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle("PCA plot: raw log-counts") Figure 7.1: PCA plot of the tung data 7.2.2 CPM logcounts(umi.qc) <- log2(calculateCPM(umi.qc, use_size_factors = FALSE) + 1) plotPCA( umi.qc[endog_genes, ], colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle("PCA plot: log2(CPM) values") Figure 7.2: PCA plot of the tung data after CPM normalisation plotRLE( umi.qc[endog_genes, ], exprs_values = "logcounts_raw", colour_by = "batch" ) + ggtitle("RLE plot: raw log-counts") Figure 7.3: Cell-wise RLE of the tung data. The relative log expression profile of each cell is represented by a boxplot, which appears as a line here. The grey bar in the middle for each cell represents the interquartile range of the RLE values; the coloured lines represent the whiskers of a boxplot and extend above and below the grey bar by 1.5 times the interquartile range. The median RLE value is shown with a circle. plotRLE( umi.qc[endog_genes, ], exprs_values = "logcounts", colour_by = "batch" ) + ggtitle("RLE plot: log2(CPM)") Figure 7.4: Cell-wise RLE of the tung data. The relative log expression profile of each cell is represented by a boxplot, which appears as a line here. The grey bar in the middle for each cell represents the interquartile range of the RLE values; the coloured lines represent the whiskers of a boxplot and extend above and below the grey bar by 1.5 times the interquartile range. The median RLE value is shown with a circle. Q: How well would you say the two approaches above normalize the data? 
7.2.3 scran scran’s method for size-factor estimation will almost always be preferable for scRNA-seq data to methods that were developed for bulk RNA-seq data (TMM, RLE, UQ). Thus, we will just demonstrate the use of scran size-factor normalization here as representative of size-factor normalization more generally. The code below computes the size factors and then the normalize() function in scater applies those size factors along with the library sizes to the count matrix to produce normalized log2-counts-per-million values that are then stored in the logcounts slot of the SingleCellExperiment object. qclust <- quickCluster(umi.qc, min.size = 30, use.ranks = FALSE) umi.qc <- computeSumFactors(umi.qc, sizes = 15, clusters = qclust) umi.qc <- normalize(umi.qc) ## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its ## own size factors ## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own ## size factors plotPCA( umi.qc[endog_genes, ], colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle("PCA plot: scran size-factor normalization") Figure 7.5: PCA plot of the tung data after LSF normalisation plotRLE( umi.qc[endog_genes, ], exprs_values = "logcounts", colour_by = "batch" ) + ggtitle("RLE plot: scran size-factor normalization") Figure 7.6: Cell-wise RLE of the tung data scran sometimes calculates negative or zero size factors. These will completely distort the normalized expression matrix. We can check the size factors scran has computed like so: summary(sizeFactors(umi.qc)) ## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.4836 0.7747 0.9532 1.0000 1.1483 3.2873 For this dataset all the size factors are reasonable so we are done. If you find scran has calculated negative size factors try increasing the cluster and pool sizes until they are all positive. We sometimes filter out cells with very large size-factors (you may like to think about why), but we will not demonstrate that here. 
7.2.4 sctransform The sctransform approach to using Pearson residuals from an regularized negative binomial generalized linear model was introduced above. Here we demonstrate how to apply this method. Note that (due to what looks like a bug in this version of sctransform) we need to convert the UMI count matrix to a sparse format to apply sctransform. umi_sparse <- as(counts(umi.qc), "dgCMatrix") ### Genes expressed in at least 5 cells will be kept sctnorm_data <- sctransform::vst(umi = umi_sparse, min_cells = 1, cell_attr = as.data.frame(colData(umi.qc)), latent_var = "log10_total_counts_endogenous") ## Calculating cell attributes for input UMI matrix ## Variance stabilizing transformation of count matrix of size 14066 by 657 ## Model formula is y ~ log10_total_counts_endogenous ## Get Negative Binomial regression parameters per gene ## Using 2000 genes, 657 cells ## | | | 0% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |======== | 12% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |================ | 25% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |======================== | 38% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu 
= fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |================================ | 50% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y 
= y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration 
limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |========================================= | 62% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |================================================= | 75% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration 
limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |========================================================= | 88% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |=================================================================| 100% ## Found 5 outliers - those will be ignored in fitting/regularization step ## Second step: Get residuals using fitted parameters for 14066 genes ## | | | 0% | |= | 2% | |== | 4% | |==== | 5% | |===== | 7% | |====== | 9% | |======= | 11% | |======== | 13% | |========= | 15% | |=========== | 16% | |============ | 18% | |============= | 20% | |============== | 22% | |=============== | 24% | |================= | 25% | |================== | 27% | |=================== | 29% | |==================== | 31% | |===================== | 33% | |====================== | 35% | |======================== | 36% | |========================= | 38% | |========================== | 40% | |=========================== | 42% | |============================ | 44% | |============================== | 45% | |=============================== | 47% | |================================ | 49% | |================================= | 51% | |================================== | 53% | 
|=================================== | 55% | |===================================== | 56% | |====================================== | 58% | |======================================= | 60% | |======================================== | 62% | |========================================= | 64% | |=========================================== | 65% | |============================================ | 67% | |============================================= | 69% | |============================================== | 71% | |=============================================== | 73% | |================================================ | 75% | |================================================== | 76% | |=================================================== | 78% | |==================================================== | 80% | |===================================================== | 82% | |====================================================== | 84% | |======================================================== | 85% | |========================================================= | 87% | |========================================================== | 89% | |=========================================================== | 91% | |============================================================ | 93% | |============================================================= | 95% | |=============================================================== | 96% | |================================================================ | 98% | |=================================================================| 100% ## Calculating gene attributes ## Wall clock passed: Time difference of 21.43582 secs ## Pearson residuals, or deviance residuals dim(sctnorm_data$y) ## [1] 14066 657 dim(umi.qc) ## [1] 14066 657 sctnorm_data$model_str ## [1] "y ~ log10_total_counts_endogenous" assay(umi.qc, "sctrans_norm") <- sctnorm_data$y Let us look at the NB GLM model parameters estimated by sctransform. 
#sce$log10_total_counts ## Matrix of estimated model parameters per gene (theta and regression coefficients) sctransform::plot_model_pars(sctnorm_data) We can look at the effect of sctransform’s normalization on three particular genes, ACTB, POU5F1 (aka OCT4) and CD74. ##c('ACTB', 'Rpl10', 'Cd74') genes_plot <- c("ENSG00000075624", "ENSG00000204531", "ENSG00000019582") sctransform::plot_model(sctnorm_data, umi_sparse, genes_plot, plot_residual = TRUE, cell_attr = as.data.frame(colData(umi.qc))) reducedDim(umi.qc, "PCA_sctrans_norm") <- reducedDim( runPCA(umi.qc[endog_genes, ], exprs_values = "sctrans_norm") ) plotReducedDim( umi.qc, use_dimred = "PCA_sctrans_norm", colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle("PCA plot: sctransform normalization") Figure 7.7: PCA plot of the tung data after sctransform normalisation (Pearson residuals). plotRLE( umi.qc[endog_genes, ], exprs_values = "sctrans_norm", colour_by = "batch" ) + ggtitle("RLE plot: sctransform normalization") Figure 7.8: Cell-wise RLE of the tung data 7.2.5 Normalisation for gene/transcript length Some methods combine library size and fragment/gene length normalization such as: RPKM - Reads Per Kilobase Million (for single-end sequencing) FPKM - Fragments Per Kilobase Million (same as RPKM but for paired-end sequencing, makes sure that paired ends mapped to the same fragment are not counted twice) TPM - Transcripts Per Kilobase Million (same as RPKM, but the order of normalizations is reversed - length first and sequencing depth second) These methods are not applicable to our dataset since the end of the transcript which contains the UMI was preferentially sequenced. Furthermore in general these should only be calculated using appropriate quantification software from aligned BAM files not from read counts since often only a portion of the entire gene/transcript is sequenced, not the entire length. 
If in doubt check for a relationship between gene/transcript length and expression level. However, here we show how these normalisations can be calculated using scater. First, we need to find the effective transcript length in Kilobases. However, our dataset contains only gene IDs, therefore we will be using the gene lengths instead of transcripts. scater uses the biomaRt package, which allows one to annotate genes by other attributes: umi.qc <- getBMFeatureAnnos( umi.qc, filters = "ensembl_gene_id", attributes = c( "ensembl_gene_id", "hgnc_symbol", "chromosome_name", "start_position", "end_position" ), biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host = "www.ensembl.org" ) # If you have mouse data, change the arguments based on this example: # getBMFeatureAnnos( # object, # filters = "ensembl_transcript_id", # attributes = c( # "ensembl_transcript_id", # "ensembl_gene_id", # "mgi_symbol", # "chromosome_name", # "transcript_biotype", # "transcript_start", # "transcript_end", # "transcript_count" # ), # biomart = "ENSEMBL_MART_ENSEMBL", # dataset = "mmusculus_gene_ensembl", # host = "www.ensembl.org" # ) Some of the genes were not annotated, therefore we filter them out: umi.qc.ann <- umi.qc[!is.na(rowData(umi.qc)$ensembl_gene_id), ] Now we compute the total gene length in Kilobases by using the end_position and start_position fields: eff_length <- abs(rowData(umi.qc.ann)$end_position - rowData(umi.qc.ann)$start_position) / 1000 plot(eff_length, rowMeans(counts(umi.qc.ann))) Figure 7.9: Gene length vs Mean Expression for the raw data There is no relationship between gene length and mean expression so FPKMs & TPMs are inappropriate for this dataset. This is what we would expect for UMI protocols that tag one end of the transcript. But we will demonstrate them anyway. Note: Here we calculate the total gene length instead of the total exon length. 
Many genes will contain lots of introns so their eff_length will be very different from what we have calculated. Please consider our calculation as an approximation. If you want to use the total exon lengths, please refer to this page. Now we are ready to perform the normalisations: tpm(umi.qc.ann) <- log2(calculateTPM(umi.qc.ann, eff_length) + 1) Plot the results as a PCA plot: tmp <- runPCA( umi.qc.ann, exprs_values = "tpm", ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 7.10: PCA plot of the tung data after TPM normalisation tpm(umi.qc.ann) <- log2(calculateFPKM(umi.qc.ann, eff_length) + 1) ## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its ## own size factors ## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own ## size factors tmp <- runPCA( umi.qc.ann, exprs_values = "tpm", ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 7.11: PCA plot of the tung data after FPKM normalisation Note The PCA looks for differences between cells. Gene length is the same across cells for each gene thus FPKM is almost identical to the CPM plot (it is just rotated) since it performs CPM first then normalizes gene length. Whereas, TPM is different because it weights genes by their length before performing CPM. 7.2.6 Reflection Q: What is your assessment of the performance of these different normalization methods on the data presented here? Q: Which normalization method would you prefer for this dataset? Why? 7.2.7 Exercise Perform the same analysis with read counts of the tung data. Use tung/reads.rds file to load the reads SCE object. Once you have finished please compare your results to ours (next chapter). 
7.2.8 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## [15] scRNA.seq.funcs_0.1.0 ## ## loaded via a namespace (and not attached): ## [1] bitops_1.0-6 bit64_0.9-7 ## [3] httr_1.4.1 progress_1.2.2 ## [5] dynamicTreeCut_1.63-1 backports_1.1.4 ## [7] sctransform_0.2.0 tools_3.6.0 ## [9] R6_2.4.0 irlba_2.3.3 ## [11] hypergeo_1.2-13 vipor_0.4.5 ## [13] DBI_1.0.0 lazyeval_0.2.2 ## [15] colorspace_1.4-1 withr_2.1.2 ## [17] prettyunits_1.0.2 tidyselect_0.2.5 ## [19] gridExtra_2.3 moments_0.14 ## [21] curl_4.2 bit_1.1-14 ## [23] compiler_3.6.0 orthopolynom_1.0-5 ## [25] BiocNeighbors_1.2.0 labeling_0.3 ## [27] bookdown_0.13 scales_1.0.0 ## [29] stringr_1.4.0 digest_0.6.21 ## [31] rmarkdown_1.15 XVector_0.24.0 ## [33] pkgconfig_2.0.3 htmltools_0.3.6 ## [35] limma_3.40.6 highr_0.8 ## [37] rlang_0.4.0 RSQLite_2.1.2 ## [39] DelayedMatrixStats_1.6.1 dplyr_0.8.3 ## [41] RCurl_1.95-4.12 magrittr_1.5 ## [43] BiocSingular_1.0.0 GenomeInfoDbData_1.2.1 ## [45] Matrix_1.2-17 Rcpp_1.0.2 ## [47] ggbeeswarm_0.6.0 munsell_0.5.0 ## 
[49] viridis_0.5.1 stringi_1.4.3 ## [51] yaml_2.2.0 edgeR_3.26.8 ## [53] MASS_7.3-51.1 zlibbioc_1.30.0 ## [55] Rtsne_0.15 plyr_1.8.4 ## [57] blob_1.2.0 grid_3.6.0 ## [59] listenv_0.7.0 dqrng_0.2.1 ## [61] crayon_1.3.4 contfrac_1.1-12 ## [63] lattice_0.20-38 cowplot_1.0.0 ## [65] hms_0.5.1 locfit_1.5-9.1 ## [67] zeallot_0.1.0 knitr_1.25 ## [69] pillar_1.4.2 igraph_1.2.4.1 ## [71] future.apply_1.3.0 reshape2_1.4.3 ## [73] codetools_0.2-16 biomaRt_2.40.4 ## [75] XML_3.98-1.20 glue_1.3.1 ## [77] evaluate_0.14 deSolve_1.24 ## [79] vctrs_0.2.0 gtable_0.3.0 ## [81] purrr_0.3.2 future_1.14.0 ## [83] assertthat_0.2.1 xfun_0.9 ## [85] rsvd_1.0.2 viridisLite_0.3.0 ## [87] tibble_2.1.3 elliptic_1.4-0 ## [89] memoise_1.1.0 AnnotationDbi_1.46.1 ## [91] beeswarm_0.2.3 globals_0.12.4 ## [93] statmod_1.4.32 7.3 Normalization practice (Reads) Figure 7.12: PCA plot of the tung data Figure 7.13: PCA plot of the tung data after CPM normalisation Figure 7.14: Cell-wise RLE of the tung data Figure 7.15: Cell-wise RLE of the tung data ## Warning: Setting 'use.ranks=TRUE' for the old defaults. ## Set 'use.ranks=FALSE' for the new defaults. 
## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its ## own size factors ## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own ## size factors Figure 7.16: PCA plot of the tung data after LSF normalisation Figure 7.17: Cell-wise RLE of the tung data Figure 7.18: Cell-wise RLE of the tung data ## Calculating cell attributes for input UMI matrix ## Variance stabilizing transformation of count matrix of size 16062 by 606 ## Model formula is y ~ log10_total_counts_endogenous ## Get Negative Binomial regression parameters per gene ## Using 2000 genes, 606 cells ## | | | 0% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## | |======== | 12% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |================ | 25% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## | |======================== | 38% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |================================ | 50% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## | |========================================= | 62% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## | |================================================= | 75% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |========================================================= | 88% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## | |=================================================================| 100% ## Found 1 outliers - those will be ignored in fitting/regularization step ## Second step: Get residuals using fitted parameters for 16062 genes ## | | | 0% | |= | 2% | |== | 3% | |=== | 5% | |==== | 6% | |===== | 8% | |====== | 10% | |======= | 11% | |======== | 13% | |========= | 14% | |========== | 16% | |=========== | 17% | |============ | 19% | |============= | 21% | |============== | 22% | |=============== | 24% | |================= | 25% | |================== | 27% | |=================== | 29% | |==================== | 30% | |===================== | 32% | |====================== | 33% | |======================= | 35% | |======================== | 37% | |========================= | 38% | |========================== | 40% | |=========================== | 41% | |============================ | 43% | |============================= | 44% | |============================== | 46% | |=============================== | 48% | 
|================================ | 49% | |================================= | 51% | |================================== | 52% | |=================================== | 54% | |==================================== | 56% | |===================================== | 57% | |====================================== | 59% | |======================================= | 60% | |======================================== | 62% | |========================================= | 63% | |========================================== | 65% | |=========================================== | 67% | |============================================ | 68% | |============================================= | 70% | |============================================== | 71% | |=============================================== | 73% | |================================================ | 75% | |================================================== | 76% | |=================================================== | 78% | |==================================================== | 79% | |===================================================== | 81% | |====================================================== | 83% | |======================================================= | 84% | |======================================================== | 86% | |========================================================= | 87% | |========================================================== | 89% | |=========================================================== | 90% | |============================================================ | 92% | |============================================================= | 94% | |============================================================== | 95% | |=============================================================== | 97% | |================================================================ | 98% | |=================================================================| 100% ## Calculating gene attributes ## Wall clock passed: Time difference of 
16.19305 secs ## [1] 16062 606 ## [1] 16062 606 ## [1] "y ~ log10_total_counts_endogenous" Let us look at the NB GLM model parameters estimated by sctransform. We can look at the effect of sctransform’s normalization on three particular genes, ACTB, POU5F1 (aka OCT4) and CD74. Figure 7.19: PCA plot of the tung reads data after sctransform normalisation (Pearson residuals). Figure 7.20: Cell-wise RLE of the tung reads data Figure 7.21: PCA plot of the tung data after TPM normalisation ## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its ## own size factors ## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own ## size factors ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## [15] scRNA.seq.funcs_0.1.0 ## ## loaded via a namespace (and not attached): ## [1] bitops_1.0-6 bit64_0.9-7 ## [3] httr_1.4.1 progress_1.2.2 ## [5] dynamicTreeCut_1.63-1 backports_1.1.4 ## [7] sctransform_0.2.0 tools_3.6.0 ## [9] R6_2.4.0 irlba_2.3.3 ## [11] hypergeo_1.2-13 vipor_0.4.5 ## [13] DBI_1.0.0 
lazyeval_0.2.2 ## [15] colorspace_1.4-1 withr_2.1.2 ## [17] prettyunits_1.0.2 tidyselect_0.2.5 ## [19] gridExtra_2.3 moments_0.14 ## [21] curl_4.2 bit_1.1-14 ## [23] compiler_3.6.0 orthopolynom_1.0-5 ## [25] BiocNeighbors_1.2.0 labeling_0.3 ## [27] bookdown_0.13 scales_1.0.0 ## [29] stringr_1.4.0 digest_0.6.21 ## [31] rmarkdown_1.15 XVector_0.24.0 ## [33] pkgconfig_2.0.3 htmltools_0.3.6 ## [35] limma_3.40.6 highr_0.8 ## [37] rlang_0.4.0 RSQLite_2.1.2 ## [39] DelayedMatrixStats_1.6.1 dplyr_0.8.3 ## [41] RCurl_1.95-4.12 magrittr_1.5 ## [43] BiocSingular_1.0.0 GenomeInfoDbData_1.2.1 ## [45] Matrix_1.2-17 Rcpp_1.0.2 ## [47] ggbeeswarm_0.6.0 munsell_0.5.0 ## [49] viridis_0.5.1 stringi_1.4.3 ## [51] yaml_2.2.0 edgeR_3.26.8 ## [53] MASS_7.3-51.1 zlibbioc_1.30.0 ## [55] Rtsne_0.15 plyr_1.8.4 ## [57] blob_1.2.0 grid_3.6.0 ## [59] listenv_0.7.0 dqrng_0.2.1 ## [61] crayon_1.3.4 contfrac_1.1-12 ## [63] lattice_0.20-38 cowplot_1.0.0 ## [65] hms_0.5.1 locfit_1.5-9.1 ## [67] zeallot_0.1.0 knitr_1.25 ## [69] pillar_1.4.2 igraph_1.2.4.1 ## [71] future.apply_1.3.0 reshape2_1.4.3 ## [73] codetools_0.2-16 biomaRt_2.40.4 ## [75] XML_3.98-1.20 glue_1.3.1 ## [77] evaluate_0.14 deSolve_1.24 ## [79] vctrs_0.2.0 gtable_0.3.0 ## [81] purrr_0.3.2 future_1.14.0 ## [83] assertthat_0.2.1 xfun_0.9 ## [85] rsvd_1.0.2 viridisLite_0.3.0 ## [87] tibble_2.1.3 elliptic_1.4-0 ## [89] memoise_1.1.0 AnnotationDbi_1.46.1 ## [91] beeswarm_0.2.3 globals_0.12.4 ## [93] statmod_1.4.32 7.4 Identifying confounding factors 7.4.1 Introduction There is a large number of potential confounders, artifacts and biases in scRNA-seq data. One of the main challenges in analysing scRNA-seq data stems from the fact that it is difficult to carry out a true technical replication (why?) to distinguish biological and technical variability. In the previous chapters we considered normalization and in this chapter we will continue to explore how experimental artifacts can be identified and removed. 
We will continue using the scater package since it provides a set of methods specifically for quality control of experimental and explanatory variables. Moreover, we will continue to work with the Blischak data that was used in the previous chapter. library(scater, quietly = TRUE) library(scran) options(stringsAsFactors = FALSE) umi <- readRDS("data/tung/umi.rds") umi.qc <- umi[rowData(umi)$use, colData(umi)$use] endog_genes <- !rowData(umi.qc)$is_feature_control The umi.qc dataset contains filtered cells and genes. Our next step is to explore technical drivers of variability in the data to inform data normalisation before downstream analysis. 7.4.2 Correlations with PCs Let’s first look again at the PCA plot of the QCed dataset using the scran-normalized log2-CPM values: qclust <- quickCluster(umi.qc, min.size = 30, use.ranks = FALSE) umi.qc <- computeSumFactors(umi.qc, sizes = 15, clusters = qclust) umi.qc <- normalize(umi.qc) ## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its ## own size factors ## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own ## size factors reducedDim(umi.qc, "PCA") <- reducedDim( runPCA(umi.qc[endog_genes,], exprs_values = "logcounts", ncomponents = 10), "PCA") plotPCA( umi.qc, colour_by = "batch", size_by = "total_features_by_counts" ) Figure 7.22: PCA plot of the tung data scater allows one to identify principal components that correlate with experimental and QC variables of interest (it ranks principal components by \\(R^2\\) from a linear model regressing PC value against the variable of interest). Let’s test whether some of the variables correlate with any of the PCs. 7.4.2.1 Top colData variables associated with PCs The plot below shows, for each of the first 10 PCs, the variance explained by the ten variables in colData(umi.qc) that are most strongly associated with the PCs. 
[We will ignore the sample_id variable: it has a unique value for each cell, so can explain all the variation for all PCs.] plotExplanatoryPCs(umi.qc) ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'is_cell_control' with fewer than 2 unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_100_features_feature_control' with fewer than 2 ## unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_200_features_feature_control' with fewer than 2 ## unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_500_features_feature_control' with fewer than 2 ## unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_50_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_100_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_200_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_500_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_50_features_MT' with fewer than 2 unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_100_features_MT' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_200_features_MT' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 
'pct_counts_in_top_500_features_MT' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'use' with fewer than 2 unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'outlier' with fewer than 2 unique levels ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf Figure 7.23: PC correlation with the number of detected genes Indeed, we can see that PC1 can be almost completely explained by batch and individual (of course batch is nested within individual). The total counts from ERCC spike-ins also explains a substantial proportion of the variability in PC1. Although number of detected genes is not strongly correlated with the PCs here (after normalization), this is commonly the case and something to look out for. 
[You might like to replicate the plot above using raw logcounts values to see what happens without normalization]. This is a well-known issue in scRNA-seq and was described here. 7.4.3 Explanatory variables scater can also compute the marginal \\(R^2\\) for each variable when fitting a linear model regressing expression values for each gene against just that variable, and display a density plot of the gene-wise marginal \\(R^2\\) values for the variables. plotExplanatoryVariables( umi.qc, exprs_values = "logcounts_raw", variables = c( "total_features_by_counts", "total_counts", "batch", "individual", "pct_counts_ERCC", "pct_counts_MT" ) ) Figure 7.24: Explanatory variables This analysis indicates that the number of detected genes (again) and also the sequencing depth (total number of UMI counts per cell) have substantial explanatory power for many genes, so these variables are good candidates for conditioning out in a normalization step, or including in downstream statistical models [cf. sctransform’s approach to normalization]. Expression of ERCCs also appears to be an important explanatory variable and one notable feature of the above plot is that batch explains more than individual. What does that tell us about the technical and biological variability of the data? 7.4.4 Other confounders In addition to correcting for batch, there are other factors that one may want to compensate for. As with batch correction, these adjustments require extrinsic information. One popular method is scLVM which allows you to identify and subtract the effect from processes such as cell-cycle or apoptosis. In addition, protocols may differ in terms of their coverage of each transcript, their bias based on the average content of A/T nucleotides, or their ability to capture short transcripts. Ideally, we would like to compensate for all of these differences and biases. 7.4.5 Exercise Perform the same analysis with read counts of the Blischak data. 
Use tung/reads.rds file to load the reads SCESet object. Once you have finished please compare your results to ours (next chapter). 7.4.6 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## [15] knitr_1.25 ## ## loaded via a namespace (and not attached): ## [1] locfit_1.5-9.1 Rcpp_1.0.2 ## [3] rsvd_1.0.2 lattice_0.20-38 ## [5] assertthat_0.2.1 digest_0.6.21 ## [7] R6_2.4.0 dynamicTreeCut_1.63-1 ## [9] evaluate_0.14 highr_0.8 ## [11] pillar_1.4.2 zlibbioc_1.30.0 ## [13] rlang_0.4.0 lazyeval_0.2.2 ## [15] irlba_2.3.3 Matrix_1.2-17 ## [17] rmarkdown_1.15 labeling_0.3 ## [19] BiocNeighbors_1.2.0 statmod_1.4.32 ## [21] stringr_1.4.0 igraph_1.2.4.1 ## [23] RCurl_1.95-4.12 munsell_0.5.0 ## [25] compiler_3.6.0 vipor_0.4.5 ## [27] BiocSingular_1.0.0 xfun_0.9 ## [29] pkgconfig_2.0.3 ggbeeswarm_0.6.0 ## [31] htmltools_0.3.6 tidyselect_0.2.5 ## [33] tibble_2.1.3 gridExtra_2.3 ## [35] GenomeInfoDbData_1.2.1 bookdown_0.13 ## [37] edgeR_3.26.8 viridisLite_0.3.0 ## [39] crayon_1.3.4 dplyr_0.8.3 ## [41] withr_2.1.2 bitops_1.0-6 
## [43] grid_3.6.0 gtable_0.3.0 ## [45] magrittr_1.5 scales_1.0.0 ## [47] dqrng_0.2.1 stringi_1.4.3 ## [49] XVector_0.24.0 viridis_0.5.1 ## [51] limma_3.40.6 DelayedMatrixStats_1.6.1 ## [53] cowplot_1.0.0 tools_3.6.0 ## [55] glue_1.3.1 beeswarm_0.2.3 ## [57] purrr_0.3.2 yaml_2.2.0 ## [59] colorspace_1.4-1 7.5 Identifying confounding factors (Reads) ## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its ## own size factors ## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own ## size factors Figure 7.25: PCA plot of the tung data ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'is_cell_control' with fewer than 2 unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_100_features_feature_control' with fewer than 2 ## unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_200_features_feature_control' with fewer than 2 ## unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_500_features_feature_control' with fewer than 2 ## unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_50_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_100_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_200_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_500_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_50_features_MT' with 
fewer than 2 unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_100_features_MT' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_200_features_MT' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_500_features_MT' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'use' with fewer than 2 unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'outlier' with fewer than 2 unique levels ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf Figure 7.26: PC correlation with the 
number of detected genes Figure 7.27: Explanatory variables ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] locfit_1.5-9.1 Rcpp_1.0.2 ## [3] rsvd_1.0.2 lattice_0.20-38 ## [5] assertthat_0.2.1 digest_0.6.21 ## [7] R6_2.4.0 dynamicTreeCut_1.63-1 ## [9] evaluate_0.14 highr_0.8 ## [11] pillar_1.4.2 zlibbioc_1.30.0 ## [13] rlang_0.4.0 lazyeval_0.2.2 ## [15] irlba_2.3.3 Matrix_1.2-17 ## [17] rmarkdown_1.15 labeling_0.3 ## [19] BiocNeighbors_1.2.0 statmod_1.4.32 ## [21] stringr_1.4.0 igraph_1.2.4.1 ## [23] RCurl_1.95-4.12 munsell_0.5.0 ## [25] compiler_3.6.0 vipor_0.4.5 ## [27] BiocSingular_1.0.0 xfun_0.9 ## [29] pkgconfig_2.0.3 ggbeeswarm_0.6.0 ## [31] htmltools_0.3.6 tidyselect_0.2.5 ## [33] tibble_2.1.3 gridExtra_2.3 ## [35] GenomeInfoDbData_1.2.1 bookdown_0.13 ## [37] edgeR_3.26.8 viridisLite_0.3.0 ## [39] crayon_1.3.4 dplyr_0.8.3 ## [41] withr_2.1.2 bitops_1.0-6 ## [43] grid_3.6.0 gtable_0.3.0 ## [45] magrittr_1.5 scales_1.0.0 ## [47] dqrng_0.2.1 stringi_1.4.3 ## [49] 
XVector_0.24.0 viridis_0.5.1 ## [51] limma_3.40.6 DelayedMatrixStats_1.6.1 ## [53] cowplot_1.0.0 tools_3.6.0 ## [55] glue_1.3.1 beeswarm_0.2.3 ## [57] purrr_0.3.2 yaml_2.2.0 ## [59] colorspace_1.4-1 knitr_1.25 7.6 Dealing with confounders 7.6.1 Introduction In the previous chapter we normalized for library size, effectively removing it as a confounder. Now we will consider removing other less well defined confounders from our data. Technical confounders (aka batch effects) can arise from differences in reagents, isolation methods, the lab/experimenter who performed the experiment, even which day/time the experiment was performed. Accounting for technical confounders, and batch effects particularly, is a large topic that also involves principles of experimental design. Here we address approaches that can be taken to account for confounders when the experimental design is appropriate. Fundamentally, accounting for technical confounders involves identifying and, ideally, removing sources of variation in the expression data that are not related to (i.e. are confounding) the biological signal of interest. Various approaches exist, some of which use spike-in or housekeeping genes, and some of which use endogenous genes. 7.6.1.1 Advantages and disadvantages of using spike-ins to remove confounders The use of spike-ins as control genes is conceptually appealing, since (ideally) the same amount of ERCC (or other) spike-in would be added to each cell in our experiment. In principle, all the variability we observe for these "genes" is due to technical noise; whereas endogenous genes are affected by both technical noise and biological variability. Technical noise can be removed by fitting a model to the spike-ins and "subtracting" this from the endogenous genes. There are several methods available based on this premise (eg. BASiCS, scLVM, RUVg); each using different noise models and different fitting procedures. 
Alternatively, one can identify genes which exhibit significant variation beyond technical noise (eg. Distance to median, Highly variable genes). Unfortunately, there are major issues with the use of spike-ins for normalisation that limit their utility in practice. Perhaps surprisingly, their variability can, for various reasons, actually be higher than that of endogenous genes. One key reason for the difficulty of their use in practice is the need to pipette minuscule volumes of spike-in solution into each cell's reaction volume, which is difficult to do accurately and consistently across cells. Furthermore, the most popular set of spike-ins, namely ERCCs, are derived from bacterial sequences, which raises concerns that their base content and structure diverge too far from gene structure in other biological systems of interest (e.g. mammalian genes) to be reliable for normalisation. Even in the best-case scenarios, spike-ins are limited to use on plate-based platforms; they are fundamentally incompatible with droplet-based platforms. Given the issues with using spike-ins, better results can often be obtained by using endogenous genes instead. Given their limited availability, normalisation methods based only on endogenous genes needed to be developed and we consider them generally preferable, even for platforms where spike-ins may be used. Where we have a large number of endogenous genes that, on average, do not vary systematically between cells and where we expect technical effects to affect a large number of genes (a very common and reasonable assumption), then such methods (for example, the RUVs method) can perform well. We explore both general approaches below. 
library(scRNA.seq.funcs) library(RUVSeq) library(scater) library(SingleCellExperiment) library(scran) library(kBET) library(sva) # Combat library(edgeR) library(harmony) set.seed(1234567) options(stringsAsFactors = FALSE) umi <- readRDS("data/tung/umi.rds") umi.qc <- umi[rowData(umi)$use, colData(umi)$use] endog_genes <- !rowData(umi.qc)$is_feature_control erccs <- rowData(umi.qc)$is_feature_control ## Apply scran sum factor normalization qclust <- quickCluster(umi.qc, min.size = 30, use.ranks = FALSE) umi.qc <- computeSumFactors(umi.qc, sizes = 15, clusters = qclust) umi.qc <- normalize(umi.qc) 7.6.2 Linear models Linear models offer a relatively simple approach to accounting for batch effects and confounders. A linear model can correct for batches while preserving biological effects if you have a balanced design. In a confounded/replicate design biological effects will not be fit/preserved. We could remove batch effects from each individual separately in order to preserve biological (and technical) variance between individuals (we will apply a similar with mnnCorrect, below). Depending on how we have pre-processed our scRNA-seq data or what modelling assumptions we are willing to make, we may choose to use normal (Gaussian) linear models (i.e. assuming a normal distribution for noise) or generalized linear models (GLM), where we can use any distribution from the exponential family. Given that we obtain highly-variable count data from scRNA-seq assays, the obvious choice for a GLM is to use the negative binomial distribution, which has proven highly successful in the analysis of bulk RNA-seq data. For demonstration purposes here we will naively correct all confounded batch effects. 7.6.2.1 Gaussian (normal) linear models The limma package in Bioconductor offers a convenient and efficient means to fit a linear model (with the same design matrix) to a dataset with a large number of features (i.e. genes) (Ritchie et al. 2015). 
An added advantage of limma is its ability to apply empirical Bayes squeezing of variance estimate to improve inference. Provided we are satisfied making the assumption of a Gaussian distribution for residuals (this may be reasonable for normalized log-counts in many cases; but it may not be—debate continues in the literature), then we can apply limma to regress out (known) unwanted sources of variation as follows. ## fit a model just accounting for batch lm_design_batch <- model.matrix(~0 + batch, data = colData(umi.qc)) fit_lm_batch <- lmFit(logcounts(umi.qc), lm_design_batch) resids_lm_batch <- residuals(fit_lm_batch, logcounts(umi.qc)) assay(umi.qc, "lm_batch") <- resids_lm_batch reducedDim(umi.qc, "PCA_lm_batch") <- reducedDim( runPCA(umi.qc[endog_genes, ], exprs_values = "lm_batch"), "PCA") plotReducedDim(umi.qc, use_dimred = "PCA_lm_batch", colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle("LM - regress out batch") Two problems are immediately apparent with the approach above. First, batch is nested within individual, so simply regressing out batch as we have done above also regresses out differences between individuals that we would like to preserve. Second, we observe that the first principal component seems to separate cells by number of genes (features) expressed, which is undesirable. We can address these concerns by correcting for batch within each individual separately, and also fitting the proportion of genes expressed per cell as a covariate. [NB: to preserve overall differences in expression levels between individuals we will need to apply a slight hack to the LM fit results (setting the intercept coefficient to zero).] Exercise 2 Perform LM correction for each individual separately. Store the final corrected matrix in the lm_batch_indi slot. What do you think of the results of this approach? 
7.6.2.2 Negative binomial generalized linear models 7.6.3 sctransform 7.6.4 Remove Unwanted Variation Factors contributing to technical noise frequently appear as "batch effects" where cells processed on different days or by different technicians systematically vary from one another. Removing technical noise and correcting for batch effects can frequently be performed using the same tool or slight variants on it. We will be considering the Remove Unwanted Variation (RUVSeq). Briefly, RUVSeq works as follows. For \\(n\\) samples and \\(J\\) genes, consider the following generalized linear model (GLM), where the RNA-Seq read counts are regressed on both the known covariates of interest and unknown factors of unwanted variation: \\[\\log E[Y|W,X,O] = W\\alpha + X\\beta + O\\] Here, \\(Y\\) is the \\(n \\times J\\) matrix of observed gene-level read counts, \\(W\\) is an \\(n \\times k\\) matrix corresponding to the factors of "unwanted variation" and \\(O\\) is an \\(n \\times J\\) matrix of offsets that can either be set to zero or estimated with some other normalization procedure (such as upper-quartile normalization). The simultaneous estimation of \\(W\\), \\(\\alpha\\), \\(\\beta\\), and \\(k\\) is infeasible. For a given \\(k\\), instead the following three approaches to estimate the factors of unwanted variation \\(W\\) are used: RUVg uses negative control genes (e.g. ERCCs), assumed to have constant expression across samples; RUVs uses centered (technical) replicate/negative control samples for which the covariates of interest are constant; RUVr uses residuals, e.g., from a first-pass GLM regression of the counts on the covariates of interest. We will concentrate on the first two approaches. 7.6.4.1 RUVg To use RUVg we will use ERCCs as negative control genes to anchor the estimation of factors representing unwanted variation. RUVg operates on the raw count data. 
We adjust the output normalized counts from RUVg so that they represent normalized counts-per-million and then apply a log2 transformation. We run RUVg twice, with \\(k=1\\) and \\(k=10\\) so that we can compare the effect of estimating different number of hidden factors to capture unwanted variation in the data. ruvg <- RUVg(counts(umi.qc), erccs, k = 1) assay(umi.qc, "ruvg1") <- log2( t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1 ) ruvg <- RUVg(counts(umi.qc), erccs, k = 10) assay(umi.qc, "ruvg10") <- log2( t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1 ) When we assess the effectiveness of various batch correction methods below, you can discuss whether or not you think using ERCCs as negative control genes for a method like RUVg is advisable (in this dataset and in general). 7.6.4.2 RUVs In this application of RUVs we treat the individuals as replicates for which the covariates of interest are constant. As above, we adjust the output normalized counts from RUVs so that they represent normalized counts-per-million and then apply a log2 transformation. Again, we run the method with \\(k=1\\) and \\(k=10\\) so that we can compare the effect of estimating different number of hidden factors. 
scIdx <- matrix(-1, ncol = max(table(umi.qc$individual)), nrow = 3) tmp <- which(umi.qc$individual == "NA19098") scIdx[1, 1:length(tmp)] <- tmp tmp <- which(umi.qc$individual == "NA19101") scIdx[2, 1:length(tmp)] <- tmp tmp <- which(umi.qc$individual == "NA19239") scIdx[3, 1:length(tmp)] <- tmp cIdx <- rownames(umi.qc) ruvs <- RUVs(counts(umi.qc), cIdx, k = 1, scIdx = scIdx, isLog = FALSE) assay(umi.qc, "ruvs1") <- log2( t(t(ruvs$normalizedCounts) / colSums(ruvs$normalizedCounts) * 1e6) + 1 ) ruvs <- RUVs(counts(umi.qc), cIdx, k = 10, scIdx = scIdx, isLog = FALSE) assay(umi.qc, "ruvs10") <- log2( t(t(ruvs$normalizedCounts) / colSums(ruvs$normalizedCounts) * 1e6) + 1 ) 7.6.5 Combat If you have an experiment with a balanced design, Combat can be used to eliminate batch effects while preserving biological effects by specifying the biological effects using the mod parameter. However the Tung data contains multiple experimental replicates rather than a balanced design so using mod1 to preserve biological variability will result in an error. combat_data <- logcounts(umi.qc) mod_data <- as.data.frame(t(combat_data)) # Basic batch removal mod0 <- model.matrix(~ 1, data = mod_data) # Preserve biological variability mod1 <- model.matrix(~ umi.qc$individual, data = mod_data) # adjust for total genes detected mod2 <- model.matrix(~ umi.qc$total_features_by_counts, data = mod_data) assay(umi.qc, "combat") <- ComBat( dat = t(mod_data), batch = factor(umi.qc$batch), mod = mod0, par.prior = TRUE, prior.plots = FALSE ) Exercise 1 Perform ComBat correction accounting for total features as a co-variate. Store the corrected matrix in the combat_tf slot. 7.6.6 mnnCorrect mnnCorrect (Haghverdi et al. 2017) assumes that each batch shares at least one biological condition with each other batch. Thus it works well for a variety of balanced experimental designs. 
However, the Tung data contains multiple replicates for each individual rather than balanced batches, thus we will normalize each individual separately. Note that this will remove batch effects between batches within the same individual but not the batch effects between batches in different individuals, due to the confounded experimental design. Thus we will merge a replicate from each individual to form three batches. do_mnn <- function(data.qc) { batch1 <- logcounts(data.qc[, data.qc$replicate == "r1"]) batch2 <- logcounts(data.qc[, data.qc$replicate == "r2"]) batch3 <- logcounts(data.qc[, data.qc$replicate == "r3"]) if (ncol(batch2) > 0) { x <- batchelor::mnnCorrect( batch1, batch2, batch3, k = 20, sigma = 0.1, cos.norm.in = TRUE, svd.dim = 2 ) return(x) } else { x <- batchelor::mnnCorrect( batch1, batch3, k = 20, sigma = 0.1, cos.norm.in = TRUE, svd.dim = 2 ) return(x) } } indi1 <- do_mnn(umi.qc[, umi.qc$individual == "NA19098"]) indi2 <- do_mnn(umi.qc[, umi.qc$individual == "NA19101"]) indi3 <- do_mnn(umi.qc[, umi.qc$individual == "NA19239"]) identical(colnames(umi.qc), colnames(cbind(indi1, indi2, indi3))) assay(umi.qc, "mnn") <- assay(cbind(indi1, indi2, indi3), "corrected") # For a balanced design: #assay(umi.qc, "mnn") <- mnnCorrect( # list(B1 = logcounts(batch1), B2 = logcounts(batch2), B3 = logcounts(batch3)), # k = 20, # sigma = 0.1, # cos.norm = TRUE, # svd.dim = 2 #) The latest version of the batchelor package has a new fastMNN() method. The fastMNN() function performs a principal components analysis (PCA). MNN identification and correction is performed on this low-dimensional representation of the data, an approach that offers some advantages in speed and denoising. The function returns a SingleCellExperiment object containing a matrix of corrected PC scores, which can be used directly for downstream analyses like clustering and visualization. [NB: fastMNN may actually be slower on small datasets like that considered here.] 
indi1 <- batchelor::fastMNN( umi.qc[, umi.qc$individual == "NA19098"], batch = umi.qc[, umi.qc$individual == "NA19098"]$replicate) indi2 <- batchelor::fastMNN( umi.qc[, umi.qc$individual == "NA19101"], batch = umi.qc[, umi.qc$individual == "NA19101"]$replicate) indi3 <- batchelor::fastMNN( umi.qc[, umi.qc$individual == "NA19239"], batch = umi.qc[, umi.qc$individual == "NA19239"]$replicate) identical(colnames(umi.qc), colnames(cbind(assay(indi1, "reconstructed"), assay(indi2, "reconstructed"), assay(indi3, "reconstructed")))) fastmnn <- cbind(assay(indi1, "reconstructed"), assay(indi2, "reconstructed"), assay(indi3, "reconstructed")) identical(rownames(umi.qc), rownames(fastmnn)) ## fastMNN() drops 66 genes, so we cannot immediately add the reconstructed expression matrix to assays() in umi.qc ## But we can run PCA on the reconstructed data from fastMNN() and add that to the reducedDim slot of our SCE object fastmnn_pca <- runPCA(fastmnn) reducedDim(umi.qc, "fastmnn") <- fastmnn_pca For further details, please consult the batchelor package documentation and vignette. 7.6.7 Harmony Harmony [Korsunsky2018fast] is a newer batch correction method, which is designed to operate on PC space. The algorithm proceeds to iteratively cluster the cells, with the objective function formulated to promote cells from multiple datasets within each cluster. Once a clustering is obtained, the positions of the centroids of each dataset are obtained on a per-cluster basis and the coordinates are corrected. This procedure is iterated until convergence. Harmony comes with a theta parameter that controls the degree of batch correction (higher values lead to more dataset integration), and can account for multiple experimental and biological factors on input. Seeing how the end result of Harmony is an altered dimensional reduction space created on the basis of PCA, we plot the obtained manifold here and exclude it from the rest of the follow-ups in the section. 
umi.qc.endog <- umi.qc[endog_genes,] umi.qc.endog <- runPCA(umi.qc.endog, exprs_values = 'logcounts', ncomponents = 20) pca <- as.matrix(reducedDim(umi.qc.endog, "PCA")) harmony_emb <- HarmonyMatrix(pca, umi.qc.endog$batch, theta=2, do_pca=FALSE) reducedDim(umi.qc.endog, "harmony") <- harmony_emb plotReducedDim( umi.qc.endog, use_dimred = 'harmony', colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) reducedDim(umi.qc, "harmony") <- reducedDim(umi.qc.endog, "harmony") 7.6.8 How to evaluate and compare confounder removal strategies A key question when considering the different methods for removing confounders is how to quantitatively determine which one is the most effective. The main reason why comparisons are challenging is because it is often difficult to know what corresponds to technical confounders and what is interesting biological variability. Here, we consider three different metrics which are all reasonable based on our knowledge of the experimental design. Depending on the biological question that you wish to address, it is important to choose a metric that allows you to evaluate the confounders that are likely to be the biggest concern for the given situation. 7.6.8.1 Effectiveness 1 We evaluate the effectiveness of the normalization by inspecting the PCA plot where colour corresponds to the technical replicates and shape corresponds to different biological samples (individuals). Separation of biological samples and interspersed batches indicates that technical variation has been removed. We always use log2-cpm normalized data to match the assumptions of PCA. 
for (nm in assayNames(umi.qc)) { cat(nm, " \\n") tmp <- runPCA( umi.qc[endog_genes, ], exprs_values = nm ) reducedDim(umi.qc, paste0("PCA_", nm)) <- reducedDim(tmp, "PCA") } for (nm in reducedDimNames(umi.qc)) { print( plotReducedDim( umi.qc, use_dimred = nm, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle(nm) ) } Exercise 3 Consider different ks for RUV normalizations. Which gives the best results? 7.6.8.2 Effectiveness 2 We can also examine the effectiveness of correction using the relative log expression (RLE) across cells to confirm technical noise has been removed from the dataset. Note RLE only evaluates whether the number of genes higher and lower than average are equal for each cell - i.e. systemic technical effects. Random technical noise between batches may not be detected by RLE. res <- list() for(n in assayNames(umi.qc)) { res[[n]] <- suppressWarnings(calc_cell_RLE(assay(umi.qc, n), erccs)) } par(mar=c(6,4,1,1)) boxplot(res, las=2) 7.6.8.3 Effectiveness 3 Another method to check the efficacy of batch-effect correction is to consider the intermingling of points from different batches in local subsamples of the data. If there are no batch-effects then proportion of cells from each batch in any local region should be equal to the global proportion of cells in each batch. kBET (Buttner et al. 2017) takes kNN networks around random cells and tests the number of cells from each batch against a binomial distribution. The rejection rate of these tests indicates the severity of batch-effects still present in the data (high rejection rate = strong batch effects). kBET assumes each batch contains the same complement of biological groups, thus it can only be applied to the entire dataset if a perfectly balanced design has been used. However, kBET can also be applied to replicate-data if it is applied to each biological group separately. 
In the case of the Tung data, we will apply kBET to each individual independently to check for residual batch effects. However, this method will not identify residual batch-effects which are confounded with biological conditions. In addition, kBET does not determine if biological signal has been preserved. compare_kBET_results <- function(sce){ indiv <- unique(sce$individual) norms <- assayNames(sce) # Get all normalizations results <- list() for (i in indiv){ for (j in norms){ tmp <- kBET( df = t(assay(sce[,sce$individual== i], j)), batch = sce$batch[sce$individual==i], heuristic = TRUE, verbose = FALSE, addTest = FALSE, plot = FALSE) results[[i]][[j]] <- tmp$summary$kBET.observed[1] } } return(as.data.frame(results)) } eff_debatching <- compare_kBET_results(umi.qc) require("reshape2") require("RColorBrewer") # Plot results dod <- melt(as.matrix(eff_debatching), value.name = "kBET") colnames(dod)[1:2] <- c("Normalisation", "Individual") colorset <- c('gray', brewer.pal(n = 9, "Oranges")) ggplot(dod, aes(Normalisation, Individual, fill=kBET)) + geom_tile() + scale_fill_gradient2( na.value = "gray", low = colorset[2], mid=colorset[6], high = colorset[10], midpoint = 0.5, limit = c(0,1)) + scale_x_discrete(expand = c(0, 0)) + scale_y_discrete(expand = c(0, 0)) + theme( axis.text.x = element_text( angle = 45, vjust = 1, size = 12, hjust = 1 ) ) + ggtitle("Effect of batch regression methods per individual") Exercise 4 Why do the raw counts appear to have little batch effects? 7.6.9 Big Exercise Perform the same analysis with read counts of the tung data. Use tung/reads.rds file to load the reads SCE object. Once you have finished please compare your results to ours (next chapter). Additionally, experiment with other combinations of normalizations and compare the results. 
7.6.10 sessionInfo() 7.7 Dealing with confounders (Reads) library(scRNA.seq.funcs) library(RUVSeq) library(scater) library(SingleCellExperiment) library(scran) library(kBET) library(sva) # Combat library(harmony) library(edgeR) set.seed(1234567) options(stringsAsFactors = FALSE) reads <- readRDS("data/tung/reads.rds") reads.qc <- reads[rowData(reads)$use, colData(reads)$use] endog_genes <- !rowData(reads.qc)$is_feature_control erccs <- rowData(reads.qc)$is_feature_control qclust <- quickCluster(reads.qc, min.size = 30) reads.qc <- computeSumFactors(reads.qc, sizes = 15, clusters = qclust) reads.qc <- normalize(reads.qc) ruvg <- RUVg(counts(reads.qc), erccs, k = 1) assay(reads.qc, "ruvg1") <- log2( t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1 ) ruvg <- RUVg(counts(reads.qc), erccs, k = 10) assay(reads.qc, "ruvg10") <- log2( t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1 ) scIdx <- matrix(-1, ncol = max(table(reads.qc$individual)), nrow = 3) tmp <- which(reads.qc$individual == "NA19098") scIdx[1, 1:length(tmp)] <- tmp tmp <- which(reads.qc$individual == "NA19101") scIdx[2, 1:length(tmp)] <- tmp tmp <- which(reads.qc$individual == "NA19239") scIdx[3, 1:length(tmp)] <- tmp cIdx <- rownames(reads.qc) ruvs <- RUVs(counts(reads.qc), cIdx, k = 1, scIdx = scIdx, isLog = FALSE) assay(reads.qc, "ruvs1") <- log2( t(t(ruvs$normalizedCounts) / colSums(ruvs$normalizedCounts) * 1e6) + 1 ) ruvs <- RUVs(counts(reads.qc), cIdx, k = 10, scIdx = scIdx, isLog = FALSE) assay(reads.qc, "ruvs10") <- log2( t(t(ruvs$normalizedCounts) / colSums(ruvs$normalizedCounts) * 1e6) + 1 ) combat_data <- logcounts(reads.qc) mod_data <- as.data.frame(t(combat_data)) # Basic batch removal mod0 = model.matrix(~ 1, data = mod_data) # Preserve biological variability mod1 = model.matrix(~ reads.qc$individual, data = mod_data) # adjust for total genes detected mod2 = model.matrix(~ reads.qc$total_features_by_counts, data = mod_data) assay(reads.qc, "combat") 
<- ComBat( dat = t(mod_data), batch = factor(reads.qc$batch), mod = mod0, par.prior = TRUE, prior.plots = FALSE ) Exercise 1 do_mnn <- function(data.qc) { batch1 <- logcounts(data.qc[, data.qc$replicate == "r1"]) batch2 <- logcounts(data.qc[, data.qc$replicate == "r2"]) batch3 <- logcounts(data.qc[, data.qc$replicate == "r3"]) if (ncol(batch2) > 0) { x <- batchelor::mnnCorrect( batch1, batch2, batch3, k = 20, sigma = 0.1, cos.norm.in = TRUE, svd.dim = 2 ) return(x) } else { x <- batchelor::mnnCorrect( batch1, batch3, k = 20, sigma = 0.1, cos.norm.in = TRUE, svd.dim = 2 ) return(x) } } indi1 <- do_mnn(reads.qc[, reads.qc$individual == "NA19098"]) indi2 <- do_mnn(reads.qc[, reads.qc$individual == "NA19101"]) indi3 <- do_mnn(reads.qc[, reads.qc$individual == "NA19239"]) assay(reads.qc, "mnn") <- cbind(indi1, indi2, indi3) # For a balanced design: #assay(reads.qc, "mnn") <- mnnCorrect( # list(B1 = logcounts(batch1), B2 = logcounts(batch2), B3 = logcounts(batch3)), # k = 20, # sigma = 0.1, # cos.norm = TRUE, # svd.dim = 2 #) glm_fun <- function(g, batch, indi) { model <- glm(g ~ batch + indi) model$coef[1] <- 0 # replace intercept with 0 to preserve reference batch. 
return(model$coef) } effects <- apply( logcounts(reads.qc), 1, glm_fun, batch = reads.qc$batch, indi = reads.qc$individual ) corrected <- logcounts(reads.qc) - t(effects[as.numeric(factor(reads.qc$batch)), ]) assay(reads.qc, "glm") <- corrected Exercise 2 reads.qc.endog = reads.qc[endog_genes,] reads.qc.endog = runPCA(reads.qc.endog, exprs_values = 'logcounts', ncomponents = 20) pca <- as.matrix(reads.qc.endog@reducedDims@listData[["PCA"]]) harmony_emb <- HarmonyMatrix(pca, reads.qc.endog$batch, theta=2, do_pca=FALSE) reads.qc.endog@reducedDims@listData[['harmony']] <- harmony_emb plotReducedDim( reads.qc.endog, use_dimred = 'harmony', colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) for(n in assayNames(reads.qc)) { tmp <- runPCA( reads.qc[endog_genes, ], exprs_values = n ) print( plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle(n) ) } res <- list() for(n in assayNames(reads.qc)) { res[[n]] <- suppressWarnings(calc_cell_RLE(assay(reads.qc, n), erccs)) } par(mar=c(6,4,1,1)) boxplot(res, las=2) compare_kBET_results <- function(sce){ indiv <- unique(sce$individual) norms <- assayNames(sce) # Get all normalizations results <- list() for (i in indiv){ for (j in norms){ tmp <- kBET( df = t(assay(sce[,sce$individual== i], j)), batch = sce$batch[sce$individual==i], heuristic = TRUE, verbose = FALSE, addTest = FALSE, plot = FALSE) results[[i]][[j]] <- tmp$summary$kBET.observed[1] } } return(as.data.frame(results)) } eff_debatching <- compare_kBET_results(reads.qc) require("reshape2") require("RColorBrewer") # Plot results dod <- melt(as.matrix(eff_debatching), value.name = "kBET") colnames(dod)[1:2] <- c("Normalisation", "Individual") colorset <- c('gray', brewer.pal(n = 9, "RdYlBu")) ggplot(dod, aes(Normalisation, Individual, fill=kBET)) + geom_tile() + scale_fill_gradient2( na.value = "gray", low = colorset[2], mid=colorset[6], high = colorset[10], midpoint = 0.5, limit = 
c(0,1)) + scale_x_discrete(expand = c(0, 0)) + scale_y_discrete(expand = c(0, 0)) + theme( axis.text.x = element_text( angle = 45, vjust = 1, size = 12, hjust = 1 ) ) + ggtitle("Effect of batch regression methods per individual") 7.8 Feature Selection library(scRNA.seq.funcs) library(matrixStats) library(M3Drop) library(RColorBrewer) library(SingleCellExperiment) set.seed(1) Single-cell RNASeq is capable of measuring the expression of many thousands of genes in every cell. However, in most situations only a portion of those will show a response to the biological condition of interest, e.g. differences in cell-type, drivers of differentiation, respond to an environmental stimulus. Most genes detected in a scRNASeq experiment will only be detected at different levels due to technical noise. One consequence of this is that technical noise and batch effects can obscure the biological signal of interest. Thus, it is often advantageous to perform feature selection to remove those genes which only exhibit technical noise from downstream analysis. Not only does this generally increase the signal:noise ratio in the data; it also reduces the computational complexity of analyses, by reducing the total amount of data to be processed. For scRNASeq data, we will be focusing on unsupervised methods of feature selection which don’t require any a priori information, such as cell-type labels or biological group, since they are not available, or may be unreliable, for many experiments. In contrast, differential expression (chapter 12) can be considered a form of supervised feature selection since it uses the known biological label of each sample to identify features (i.e. genes) which are expressed at different levels across groups. For this section we will continue working with the Deng data. 
deng <- readRDS("data/deng/deng-reads.rds") celltype_labs <- colData(deng)$cell_type2 cell_colors <- brewer.pal(max(3,length(unique(celltype_labs))), "Set3") Feature selection is performed after QC, however this data has already been QCed so we can skip that step here. M3Drop contains two different feature selection methods: “M3DropFeatureSelection”, which is based on a Michaelis-Menten curve and is designed for full-transcript single-cell RNA-seq data (such as Smartseq2), and “NBumiFeatureSelectionCombinedDrop”, which is based on a negative binomial model and is designed for UMI count data. We will demonstrate both on the Deng Smartseq2 data. M3Drop feature selection runs directly on a normalized (but not log-transformed) expression matrix. This can be extracted from our SingleCellExperiment object using the command below. expr_matrix <- M3Drop::M3DropConvertData(deng) This function is compatible with most single-cell RNA-seq analysis packages including: scater, SingleCellExperiment, monocle, and Seurat. It can also convert an existing expression matrix to the correct form (removing undetected genes & normalizing/delogging) if you specify whether the matrix is raw counts, or log transformed. Check the manual for details: ?M3Drop::M3DropConvertData Exercise 1: Confirm that the conversion function has removed undetected genes: 7.8.1 Identifying Genes vs a Null Model There are two main approaches to unsupervised feature selection. The first is to identify genes which behave differently from a null model describing just the technical noise expected in the dataset. If the dataset contains spike-in RNAs they can be used to directly model technical noise. However, measurements of spike-ins may not experience the same technical noise as endogenous transcripts (Svensson et al., 2017). In addition, scRNASeq experiments often contain only a small number of spike-ins which reduces our confidence in fitted model parameters. 
7.8.1.1 Highly Variable Genes The first method proposed to identify features in scRNASeq datasets was to identify highly variable genes (HVG). HVG assumes that if genes have large differences in expression across cells some of those differences are due to biological difference between the cells rather than technical noise. However, because of the nature of count data, there is a positive relationship between the mean expression of a gene and the variance in the read counts across cells. This relationship must be corrected for to properly identify HVGs. Exercise 2 Using the functions rowMeans and rowVars to plot the relationship between mean expression and variance for all genes in this dataset. (Hint: use log=“xy†to plot on a log-scale). A popular method to correct for the relationship between variance and mean expression was proposed by Brennecke et al.. To use the Brennecke method, we first normalize for library size then calculate the mean and the square coefficient of variation (variation divided by the squared mean expression). A quadratic curve is fit to the relationship between these two variables for the ERCC spike-in, and then a chi-square test is used to find genes significantly above the curve. This method is included in the M3Drop package as the Brennecke_getVariableGenes(counts, spikes) function. However, this dataset does not contain spike-ins so we will use the entire dataset to estimate the technical noise. In the figure below the red curve is the fitted technical noise model and the dashed line is the 95% CI. Pink dots are the genes with significant biological variability after multiple-testing correction. Brennecke_HVG <- BrenneckeGetVariableGenes( expr_matrix, fdr = 0.01, minBiolDisp = 0.5 ) This function returns a matrix of significant genes as well as their estimated effect size (difference between observed and expected coefficient of variation), and their significance as raw p.values and FDR corrected q.values. 
For now we will just keep the names of the significant HVG genes. HVG_genes <- Brennecke_HVG$Gene Exercise 3 How many genes were significant using BrenneckeGetVariableGenes? 7.8.1.2 High Dropout Genes An alternative to finding HVGs is to identify genes with unexpectedly high numbers of zeros. The frequency of zeros, known as the “dropout rate”, is very closely related to expression level in scRNASeq data. Zeros are the dominant feature of single-cell RNASeq data, typically accounting for over half of the entries in the final expression matrix. These zeros predominantly result from mRNAs failing to be reverse transcribed (Andrews and Hemberg, 2016). Reverse transcription is an enzyme reaction thus can be modelled using the Michaelis-Menten equation: \\[P_{dropout} = 1 - S/(K + S)\\] where \\(S\\) is the mRNA concentration in the cell (we will estimate this as average expression) and \\(K\\) is the Michaelis-Menten constant. Because the Michaelis-Menten equation is a convex non-linear function, genes which are differentially expressed across two or more populations of cells in our dataset will be shifted up/right of the Michaelis-Menten model (see Figure below). K <- 49 S_sim <- 10^seq(from = -3, to = 4, by = 0.05) # range of expression values MM <- 1 - S_sim / (K + S_sim) plot( S_sim, MM, type = "l", lwd = 3, xlab = "Expression", ylab = "Dropout Rate", xlim = c(1,1000) ) S1 <- 10 # Mean expression in population 1 P1 <- 1 - S1 / (K + S1) # Dropouts for cells in condition 1 S2 <- 750 # Mean expression in population 2 P2 <- 1 - S2 / (K + S2) # Dropouts for cells in condition 2 points( c(S1, S2), c(P1, P2), pch = 16, col = "grey85", cex = 3 ) mix <- 0.5 # proportion of cells in condition 1 points( S1 * mix + S2 * (1 - mix), P1 * mix + P2 * (1 - mix), pch = 16, col = "grey35", cex = 3 ) Note: add log="x" to the plot call above to see how this looks on the log scale, which is used in M3Drop figures. 
Exercise 4: Produce the same plot as above with different expression levels (S1 & S2) and/or mixtures (mix). We use M3Drop to identify significant outliers to the right of the MM curve. We also apply 1% FDR multiple testing correction: M3Drop_genes <- M3DropFeatureSelection( expr_matrix, mt_method = "fdr", mt_threshold = 0.01 ) M3Drop_genes <- M3Drop_genes$Gene An alternative method is contained in the M3Drop package that is tailored specifically for UMI-tagged data which generally contains many zeros resulting from low sequencing coverage in addition to those resulting from insufficient reverse-transcription. This model is the Depth-Adjusted Negative Binomial (DANB). This method describes each expression observation as a negative binomial model with a mean related to both the mean expression of the respective gene and the sequencing depth of the respective cell, and a variance related to the mean-expression of the gene. This method is designed to model the raw counts in a dataset directly, and we can extract the appropriate matrix using the “NBumiConvertData†function similar to M3Drop. However, we have an extra step for fitting the model since that is the slowest step of the method and we are currently working on additional methods that can use this model information for other things (such as normalization, co-expression testing, highly variable gene detection). This method includes a binomial test of the significance of each feature, but since the Deng data is not UMI counts the model does not fit the noise sufficiently and far too many genes will be called as significant. Thus we will take the top 1500 by effect size. 
deng_int <- NBumiConvertData(deng) DANB_fit <- NBumiFitModel(deng_int) # DANB is fit to the raw count matrix # Perform DANB feature selection DropFS <- NBumiFeatureSelectionCombinedDrop(DANB_fit, method="fdr", qval.thresh=0.01, suppress.plot=FALSE) DANB_genes <- DropFS[1:1500,]$Gene Exercise 5 How many genes were significant using NBumiFeatureSelectionCombinedDrop? 7.8.2 Correlated Expression A completely different approach to feature selection is to use gene-gene correlations. This method is based on the idea that multiple genes will be differentially expressed between different cell-types or cell-states. Genes which are expressed in the same cell-population will be positively correlated with each other whereas genes expressed in different cell-populations will be negatively correlated with each other. Thus important genes can be identified by the magnitude of their correlation with other genes. The limitation of this method is that it assumes technical noise is random and independent for each cell, thus shouldn’t produce gene-gene correlations, but this assumption is violated by batch effects which are generally systematic between different experimental batches and will produce gene-gene correlations. As a result it is more appropriate to take the top few thousand genes as ranked by gene-gene correlation than consider the significance of the correlations. cor_feat <- M3Drop::corFS(expr_matrix) Cor_genes <- names(cor_feat)[1:1500] Lastly, another common method for feature selection in scRNASeq data is to use PCA loadings. Genes with high PCA loadings are likely to be highly variable and correlated with many other variable genes, thus may be relevant to the underlying biology. However, as with gene-gene correlations PCA loadings tend to be susceptible to detecting systematic variation due to batch effects; thus it is recommended to plot the PCA results to determine those components corresponding to the biological variation rather than batch effects. 
# PCA is typically performed on log-transformed expression data pca <- prcomp(log(expr_matrix + 1) / log(2)) # plot projection plot( pca$rotation[,1], pca$rotation[,2], pch = 16, col = cell_colors[as.factor(celltype_labs)] ) # calculate loadings for components 1 and 2 score <- rowSums(abs(pca$x[,c(1,2)])) names(score) <- rownames(expr_matrix) score <- score[order(-score)] PCA_genes <- names(score[1:1500]) Exercise 6 Consider the top 5 principal components. Which appear to be most biologically relevant? How does the top 1,500 features change if you consider the loadings for those components? 7.8.3 Comparing Methods We can check whether the identified features really do represent genes differentially expressed between cell-types in this dataset. M3DropExpressionHeatmap( M3Drop_genes, expr_matrix, cell_labels = celltype_labs ) We can also consider how consistent each feature selection method is with the others using the Jaccard Index: J <- sum(M3Drop_genes %in% HVG_genes)/length(unique(c(M3Drop_genes, HVG_genes))) Exercise 7 Plot the expression of the features for each of the other methods. Which appear to be differentially expressed? How consistent are the different methods for this dataset? 7.8.4 sessionInfo() References "], -["handling-sparsity.html", "8 Handling sparsity 8.1 Challenge: Handling sparsity in single-cell RNA sequencing 8.2 Status 8.3 Open problems", " 8 Handling sparsity The material below is reproduced from (Laehnemann et al. 2019): Laehnemann,D. et al. (2019) 12 Grand challenges in single-cell data science PeerJ Preprints. link 8.1 Challenge: Handling sparsity in single-cell RNA sequencing A comprehensive characterization of the transcriptional status of individual cells enables us to gain full insight into the interplay of transcripts within single cells. However, scRNA-seq measurements typically suffer from large fractions of observed zeros, where a given gene in a given cell has no unique molecule identifiers or reads mapping to it. 
These observed zero values can represent either missing data (i.e.~a gene is expressed but not detected by the sequencing technology) or true absence of expression. The proportion of zeros, or degree of sparsity, is thought to be due to imperfect reverse transcription and amplification, and other technical limitations (), and depends on the scRNA-seq platform used, the sequencing depth and the underlying expression level of the gene. The term ``dropout’’ is often used to denote observed zero values in scRNA-seq data, but this term conflates zero values attributable to methodological noise and biologically-true zero expression, so we recommend against its use as a catch-all term for observed zeros. Sparsity in scRNA-seq data can hinder downstream analyses, but it is challenging to model or handle it appropriately, and thus, there remains an ongoing need for improved methods. Sparsity pervades all aspects of scRNA-seq data analysis, but here we focus on the linked problems of learning latent spaces and imputing'' expression values from scRNA-seq data (\\autoref{fig:denoising-imputation}). Imputation,data smoothing’’ and ``data reconstruction’’ approaches are closely linked to the challenges of normalization. But whereas normalization generally aims to make expression values between cells more comparable to each other, imputation and data smoothing approaches aim to achieve adjusted data values that—it is hoped—better represent the true expression values. Imputation methods could therefore be used for normalization, but do not entail all possible or useful approaches to normalization. 8.2 Status The imputation of missing values has been very successful for genotype data. Crucially, when imputing genotypes we often know which data are missing (e.g.~when no genotype call is possible due to no coverage of a locus, although see section for the challenges with data) and rich sources of external information are available (e.g.~haplotype reference panels). 
Thus, genotype imputation is now highly accurate and a commonly-used step in data processing for genetic association studies . The situation is somewhat different for scRNA-seq data, as we do not routinely have external reference information to apply (see ). In addition, we can never be sure which observed zeros represent “missing data” and which accurately represent a true gene expression level in the cell \\citep{hicks_missing_2018}. Observed zeros can either represent “biological” zeros, i.e.~those present because the true expression level of a gene in a cell was zero. Or they are the result of methodological noise, which can arise when a gene has true non-zero expression in a cell, but no counts are observed due to failures at any point in the complicated process of processing mRNA transcripts in cells into mapped reads. Such noise can lead to artefactual zeros that are either more systematic (e.g.~sequence-specific mRNA degradation during cell lysis) or that occur by chance (e.g.~barely expressed transcripts that at the same expression level will sometimes be detected and sometimes not, due to sampling variation, e.g.~in the sequencing). The high degree of sparsity in scRNA-seq data therefore arises from technical zeros and true biological zeros, which are difficult to distinguish from one another. In general, two broad approaches can be applied to tackle this problem of sparsity: use statistical models that inherently model the sparsity, sampling variation and noise modes of scRNA-seq data with an appropriate data generative model; or attempt to “impute” values for observed zeros (ideally the technical zeros; sometimes also non-zero values) that better approximate the true gene expression levels. We prefer to use the first option where possible, and for many single-cell data analysis problems, statistical models appropriate for sparse count data exist and should be used (e.g.~for differential expression analysis). 
However, there are many cases where the appropriate models are not available and accurate imputation of technical zeros would allow better results from downstream methods and algorithms that cannot handle sparse count data. For example, imputation could be particularly useful for many dimension reduction, visualization and clustering applications. It is therefore desirable to improve both statistical methods that work on sparse count data directly and approaches for data imputation for scRNA-seq data, whether by refining existing techniques or developing new ones (see also ). We define three broad (and sometimes overlapping) categories of methods that can be used to ``impute’’ scRNA-seq data in the absence of an external reference: __Model-based imputation methods of technical zeros_ use probabilistic models to identify which observed zeros represent technical rather than biological zeros and aim to impute expression levels just for these technical zeros, leaving other observed expression levels untouched; or __Data-smoothing methods_ define sets of ``similar’’ cells (e.g.~cells that are neighbors in a graph or occupy a small region in a latent space) and adjust expression values for each cell based on expression values in similar cells. These methods adjust all expression values, including technical zeros, biological zeros and observed non-zero values. __Data-reconstruction methods_ typically aim to define a latent space representation of the cells. This is often done through matrix factorization (e.g.~principal component analysis) or, increasingly, through machine learning approaches (e.g.~variational autoencoders that exploit deep neural networks to capture non-linear relationships). Although a broad class of methods, both matrix factorization methods and autoencoders (among others) are able to reconstruct'' the observed data matrix from low-rank or simplified representations. 
The reconstructed data matrix will typically no longer be sparse (with many zeros) and the implicitlyimputed’’ data can be used for downstream applications that cannot handle sparse count data. The first category of methods generally seeks to infer a probabilistic model that captures the data generation mechanism. Such generative models can be used to identify, probabilistically, which observed zeros correspond to technical zeros (to be imputed) and which correspond to biological zeros (to be left alone). There are many model-based imputation methods already available that use ideas from clustering (e.g.~k-means), dimension reduction, regression and other techniques to impute technical zeros, oftentimes combining ideas from several of these approaches. These include SAVER , ScImpute , bayNorm , scRecover , and VIPER . Clustering methods that implicitly impute values, such as CIDR and BISCUIT , are closely related to this class of imputation methods. Data-smoothing methods, which adjust all gene expression levels based on expression levels in similar'' cells, have also been proposed to handle imputation problems. We might regard these approaches asdenoising’’ methods. To take a simplified example (), we might imagine that single cells originally refer to points in two-dimensional space, but are likely to describe a one-dimensional curve; projecting data points onto that curve eventually allows imputation of the ``missing’’ values (but all points are adjusted, or smoothed, not just true technical zeros). Prominent data-smoothing approaches to handling sparse counts include: diffusion-based MAGIC k-nearest neighbor-based knn-smooth network diffusion-based netSmooth clustering-based DrImpute locality sensitive imputation in LSImpute A major task in the analysis of high-dimensional single-cell data is to find low-dimensional representations of the data that capture the salient biological signals and render the data more interpretable and amenable to further analyses. 
As it happens, the matrix factorization and latent-space learning methods used for that task also provide another route for imputation through their ability to reconstruct the observed data matrix from simplified representations of it. PCA is one such standard matrix factorization method that can be applied to scRNA-seq data (preferably after suitable data normalization) as are other widely-used general statistical methods like ICA and NMF. As (linear) matrix factorization methods, PCA, ICA and NMF decompose the observed data matrix into a “small” number of factors in two low-rank matrices, one representing cell-by-factor weights and one gene-by-factor loadings. Many matrix factorization methods with tweaks for single-cell data have been proposed in recent years, including: ZIFA, a zero-inflated factor analysis f-scLVM, a sparse Bayesian latent variable model GPLVM, a Gaussian process latent variable model ZINB-WaVE, a zero-inflated negative binomial factor model scCoGAPS, an extension of consensus , a meta-analysis approach to pCMF, probabilistic count matrix factorization with a Poisson model SDA, sparse decomposition of arrays; another sparse Bayesian method . Some data reconstruction approaches have been specifically proposed for imputation, including: ENHANCE, denoising with an aggregation step ALRA, SVD with adaptive thresholding scRMD, robust matrix decomposition Recently, machine learning methods have emerged that apply autoencoders and deep neural networks ) or ensemble learning ) to impute expression values. Additionally, many deep learning methods have been proposed for single-cell data analysis that can, but need not, use probabilistic data generative processes to capture low-dimensional or latent space representations of a dataset. 
Even if imputation is not a main focus, such methods can generate ``imputed’’ expression values as an upshot of a model primarily focused on other tasks like learning latent spaces, clustering, batch correction, or visualization (and often several of these tasks simultaneously). The latter set includes tools such as: DCA, an autoencoder with a zero-inflated negative binomial distribution scVI, a variational autoencoder with a zero-inflated negative binomial model LATE VASC compscVAE scScope Tybalt SAUCIE scvis net-SNE BERMUDA, focused on batch correction DUSC Expression Saliency others Besides the three categories described above, a small number of scRNA-seq imputation methods have been developed to incorporate information external to the current dataset for imputation. These include: ADImpute , which uses gene regulatory network information from external sources; SAVER-X , a transfer learning method for denoising and imputation that can use information from atlas-type resources; and methods that borrow information from matched bulk RNA-seq data like URSM and SCRABBLE . 8.3 Open problems A major challenge in this context is the circularity that arises when imputation solely relies on information that is internal to the imputed dataset. This circularity can artificially amplify the signal contained in the data, leading to inflated correlations between genes and/or cells. In turn, this can introduce false positives in downstream analyses such as differential expression testing and gene network inference . Handling batch effects and potential confounders requires further work to ensure that imputation methods do not mistake unwanted variation from technical sources for biological signal. In a similar vein, single-cell experiments are affected by various uncertainties (see ). 
Approaches that allow quantification and propagation of the uncertainties associated with expression measurements (), may help to avoid problems associated with ‘overimputation’ and the introduction of spurious signals noted by . To avoid this circularity, it is important to identify reliable external sources of information that can inform the imputation process. One possibility is to exploit external reference panels (like in the context of genetic association studies). Such panels are not generally available for scRNA-seq data, but ongoing efforts to develop large scale cell atlases could provide a valuable resource for this purpose. Systematic integration of known biological network structures is desirable and may also help to avoid circularity. A possible approach is to encode network structure knowledge as prior information, as attempted in netSmooth and ADImpute. Another alternative solution is to explore complementary types of data that can inform scRNA-seq imputation. This idea was adopted in SCRABBLE and URSM, where an external reference is defined by bulk expression measurements from the same population of cells for which imputation is performed. Yet another possibility could be to incorporate orthogonal information provided by different types of molecular measurements (see ). Methods designed to integrate multi-omics data could then be extended to enable scRNA-seq imputation, e.g.~through generative models that explicitly link scRNA-seq with other data types or by inferring a shared low-dimensional latent structure that could be used within a data-reconstruction framework. With the proliferation of alternative methods, comprehensive benchmarking is urgently required as for all areas of single-cell data analysis . Early attempts by and provide valuable insights into the performance of methods available at the time. But many more methods have since been proposed and even more comprehensive benchmarking platforms are needed. 
Many methods, especially those using deep learning, depend strongly on choice of hyperparameters . There, more detailed comparisons that explore parameter spaces would be helpful, extending work like that from comparing dimensionality reduction methods. Learning from exemplary benchmarking studies , it would be immensely beneficial to develop a community-supported benchmarking platform with a wide-range of synthetic and experiment ground-truth datasets (or as close as possible, in the case of experimental data) and a variety of thoughtful metrics for evaluating performance. Ideally, such a benchmarking platform would remain dynamic beyond an initial publication to allow ongoing comparison of methods as new approaches are proposed. Detailed benchmarking would also help to establish when normalization methods derived from explicit count models may be preferable to imputation. Finally, scalability for large numbers of cells remains an ongoing concern for imputation, data smoothing and data reconstruction methods, as for all high-throughput single-cell methods and software (see ). 
library(scater) ## Loading required package: SingleCellExperiment ## Loading required package: SummarizedExperiment ## Loading required package: GenomicRanges ## Loading required package: stats4 ## Loading required package: BiocGenerics ## Loading required package: parallel ## ## Attaching package: 'BiocGenerics' ## The following objects are masked from 'package:parallel': ## ## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, ## clusterExport, clusterMap, parApply, parCapply, parLapply, ## parLapplyLB, parRapply, parSapply, parSapplyLB ## The following objects are masked from 'package:stats': ## ## IQR, mad, sd, var, xtabs ## The following objects are masked from 'package:base': ## ## anyDuplicated, append, as.data.frame, basename, cbind, ## colnames, dirname, do.call, duplicated, eval, evalq, Filter, ## Find, get, grep, grepl, intersect, is.unsorted, lapply, Map, ## mapply, match, mget, order, paste, pmax, pmax.int, pmin, ## pmin.int, Position, rank, rbind, Reduce, rownames, sapply, ## setdiff, sort, table, tapply, union, unique, unsplit, which, ## which.max, which.min ## Loading required package: S4Vectors ## ## Attaching package: 'S4Vectors' ## The following object is masked from 'package:base': ## ## expand.grid ## Loading required package: IRanges ## Loading required package: GenomeInfoDb ## Loading required package: Biobase ## Welcome to Bioconductor ## ## Vignettes contain introductory material; view with ## 'browseVignettes()'. To cite Bioconductor, see ## 'citation("Biobase")', and for packages 'citation("pkgname")'. 
## Loading required package: DelayedArray ## Loading required package: matrixStats ## ## Attaching package: 'matrixStats' ## The following objects are masked from 'package:Biobase': ## ## anyMissing, rowMedians ## Loading required package: BiocParallel ## ## Attaching package: 'DelayedArray' ## The following objects are masked from 'package:matrixStats': ## ## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges ## The following objects are masked from 'package:base': ## ## aperm, apply, rowsum ## Loading required package: ggplot2 ## ## Attaching package: 'scater' ## The following object is masked from 'package:S4Vectors': ## ## rename ## The following object is masked from 'package:stats': ## ## filter library(SingleCellExperiment) library(glmpca) library(ggplot2) library(Polychrome) References "], -["latent-spaces.html", "9 Latent spaces 9.1 Dimensionality reduction 9.2 Matrix factorization and factor analysis 9.3 Autoencoders 9.4 Interpretable latent spaces", " 9 Latent spaces In many cases we may like to think of cells sitting in a low-dimensional, “latent†space that captures relationships between cells more intuitively than the very high-dimensional gene expression space. 9.1 Dimensionality reduction Why? - Reduce Curse of Dimensionality problems - Increase storage and computational efficiency - Visualize Data in 2D or 3D Difficulty: Need to decide how many dimension to keep. 9.1.1 PCA: Principal component analysis 9.1.1.1 (traditional) PCA PCA is a linear feature extraction technique. It performs a linear mapping of the data to a lower-dimensional space in such a way that the variance of the data in the low-dimensional representation is maximized. It does so by calculating the eigenvectors from the covariance matrix. The eigenvectors that correspond to the largest eigenvalues (the principal components) are used to reconstruct a significant fraction of the variance of the original data. 
In simpler terms, PCA combines your input features in a specific way so that you can drop the least important feature while still retaining the most valuable parts of all of the features. As an added benefit, each of the new features or components created after PCA is independent of all the others. 9.1.1.1.1 Basic ideas of PCA Idea1: Dropping dimensions = Projection onto lower dimensional space  Which dimension should we keep? Idea2: more variation = more information  But what if the plot cannot readily be projected onto either the X- or the Y-axis? 9.1.1.1.2 Steps of PCA Step1: Rotation We want a set of axes (called Principal Components) that satisfies: -The 1st axis points in the direction where variation is maximized, and so on -They are orthogonal to each other It can be shown that the eigenvectors of the covariance matrix satisfy these conditions, and the eigenvector corresponding to the largest eigenvalue accounts for the most variation. Step2: Projection (3-dimension \\(\\rightarrow\\) 2-dimension) 9.1.1.1.3 An example of PCA deng <- readRDS("data/deng/deng-reads.rds") my_color1 <- createPalette(6, c("#010101", "#ff0000"), M=1000) names(my_color1) <- unique(as.character(deng$cell_type1)) my_color2 <- createPalette(10, c("#010101", "#ff0000"), M=1000) names(my_color2) <- unique(as.character(deng$cell_type2)) deng <- runPCA(deng, ncomponents = 2) plotPCA(deng, colour_by = "cell_type1") + scale_fill_manual(values = my_color1) ## Scale for 'fill' is already present. Adding another scale for 'fill', ## which will replace the existing scale. plotPCA(deng, colour_by = "cell_type2") + scale_fill_manual(values = my_color2) ## Scale for 'fill' is already present. Adding another scale for 'fill', ## which will replace the existing scale. 9.1.1.1.4 Advantages and limits of PCA: Advantages: fast, easy to use and intuitive. Limits: Can lead to local inconsistency, i.e. far away points can become nearest neighbours. 
It is a linear projection, like casting a shadow, meaning it can’t capture non-linear dependencies. For instance, PCA would not be able to “unroll†the following structure. 9.1.1.2 GLM-PCA GLM-PCA is a generalized version of the traditional PCA. The traditional PCA implicitly imposes an assumption of Gaussian distribution. The purpose of GLM-PCA is to loosen this condition to accommodate other distributions of the exponential family. Why does PCA assume a Gaussian distribution? Let \\(x_1, \\dots, x_n \\in \\mathcal{R}^d\\) be the \\(d\\)-dimensional data observed. PCA is looking for their projections onto a subspace: \\(u_1, \\dots, u_n\\), such that \\(\\sum_{i = 1}^n \\Vert x_i - u_i\\Vert^2\\) is minimized. This objective function can be interpretated in two ways: Interpretation 1: the variance of the projections/principal components: \\(\\sum_{i} \\Vert u_i \\Vert ^2\\), if the data is centered at the origin (\\(\\sum_{i} x_i = 0\\)); Interpretation 2: Each point \\(x_i\\) is thought of as a random draw from a probability distribution centered at \\(u_i\\). If we take this probability as a unit Gaussian, that is \\(x_i \\sim N(u_i, 1)\\), then the likelihood is \\(\\prod_{i = 1}^n \\exp (- \\Vert x_i - u_i\\Vert^2)\\), and the negative log likelihood is exactly the objective function. This assumption is often inappropriate for non-Gaussian distributed data, for example discrete data. Therefore, GLM-PCA generalizes the Gaussian likelihood into a likelihood of any exponential-family distribution, and applies appropriate link functions to \\(u_i\\)’s in the same as a GLM does to non-Gaussian responses. The following example compares GLM-PCA with Poisson marginals to the traditional PCA, which is identical to the result from plotPCA. 
## GLM-PCA Y <- assay(deng, "counts") Y <- Y[rowSums(Y) > 0, ] system.time(res1 <- glmpca(Y, L=2, fam="poi", verbose=TRUE)) ## user system elapsed ## 82.313 22.987 105.317 pd1 <- data.frame(res1$factors, dimreduce="glmpca-poisson", clust = factor(deng$cell_type2)) ## traditional PCA pd2 <- data.frame(reducedDim(deng, "PCA"), dimreduce="runPCA", clust = factor(deng$cell_type2)) colnames(pd2) <- colnames(pd1) ## plot pd <- rbind(pd1, pd2) ggplot(pd, aes(x = dim1, y = dim2, colour = clust)) + geom_point(size=2) + facet_wrap(~dimreduce, scales="free", nrow=3) + scale_color_manual(values = my_color2) + theme_bw() Let us compare GLM-PCA and standard PCA (using normalized log-counts data) on the Tung data, before cells have been QC'd. ## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its ## own size factors ## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own ## size factors Repeat these plots with the QC'd Tung data. ## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its ## own size factors ## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own ## size factors 9.1.2 tSNE: t-Distributed Stochastic Neighbor Embedding t-SNE is an advanced version of the original SNE algorithm. [ref] 9.1.2.1 Motivation The weakness of PCA is the motivation behind the SNE algorithm. PCA focuses on global covariance structure, which leads to local inconsistency. SNE aims to preserve local structure, or to preserve the relationships among data points (i.e. similar points remain similar; distinct points remain distinct). Unlike PCA, SNE is not limited to linear projections, which makes it suited to all sorts of datasets, including the swiss-roll data we have seen above. t-SNE solves the crowding issue of the original SNE. 
9.1.2.2 original SNE SNE minimizes the divergence between two distributions: a distribution that measures pairwise similarities of the input objects and a distribution that measures pairwise similarities of the corresponding low-dimensional points in the embedding. Goal: preserve neighbourhoods. Soft neighbourhood: For each data point \\(x_i\\), the \\(i\\rightarrow j\\) probability is the probability that point \\(x_i\\) chooses \\(x_j\\) as its neighbour: \\(p_{j|i} \\propto \\exp(-\\Vert x_i - x_j \\Vert^2/2\\delta^2)\\). (This can be thought of as the probability of \\(x_j\\) in \\(N(x_i, \\delta)\\)) \\(\\Vert x_i - x_j \\Vert^2\\) is the Euclidean distance: The closer \\(x_i\\) and \\(x_j\\) are, the larger \\(p_{j|i}\\) is. \\(\\delta^2\\) denotes the vairance, it sets the size of the neighbourhood.  Very low \\(\\Rightarrow\\) all the probability is in the nearest neighbour  Very high \\(\\Rightarrow\\) uniform weights We generally want \\(\\delta^2\\) to be small for points in densely populated areas and large for sparse areas, so that the number of neighbours of all data points are roughly the same. It is computed with a user specified parameter (perplexity) which indicates the effective number of neighbours for a data point. Similarity matrix Collect \\(p_{j|i}\\) for all data points into a matrix, then this matrix preserves the key information of the local neighbourhood structure. How SNE works: Given high-dimensional data \\(X = \\{x_1, \\dots, x_n \\in \\mathcal{R}^d \\}\\), obtain the similarity matrix \\(P\\); Let \\(Y =\\{y_1, \\dots, y_n \\in \\mathcal{R}^2\\}\\) be a 2-dimensional data, the coordinates for visualization. Obtain a smilarity matrix of \\(Y\\), denoted as \\(Q\\), in the same way as \\(X\\), except that \\(\\delta^2\\) is fixed at 1/2. Look for \\(Y\\) such that \\(Q\\) is as similar to \\(P\\) as possible. 
Measurement of how similar two distributions is: Kullback-Leibler divergence (The definition of this cost function and the optimization procedure are out of the scope of this course) 9.1.2.3 t-SNE The motivation of t-SNE is to solve one of the main issues of SNE, the crowding problem. Crowding problem: In high dimension we have more room, points can have a lot of different neighbours that are far apart from each other. But in low dimensions, we don’t have enough room to accommodate all neighbours. Forexample, in 2D a point can have a few neighbors at distance one all far from each other - what happens when we embed in 1D? Solution: Change the distribution of the low-dimensional data \\(Q\\) into a student-t distribution. Recall that SNE is trying to minimize the dissimilarity of \\(P\\) and \\(Q\\), and \\(P\\) has a Gaussian distribution. So for a pair of points (\\(x_i\\) and \\(x_j\\) in high-dimension, \\(y_i\\) and \\(y_j\\) in low-dimension) to reach the same probability, the distance between \\(y_i\\) and \\(y_j\\) would be much larger (i.e. much farther apart). 9.1.2.4 Example of t-SNE: muraro <- readRDS("data/pancreas/muraro.rds") tmp <- runTSNE(muraro, perplexity = 3) plotTSNE(tmp, colour_by = "cell_type1") tmp <- runTSNE(muraro, perplexity = 50) plotTSNE(tmp, colour_by = "cell_type1") 9.1.2.5 Limits of t-SNE: Not a convex problem, i.e. the cost function has multiple local minima. Non-deterministic. Require specification of very important parameters, e.g. perplexity. Coordinates after embedding have no meaning. Therefore can merely be used for visualization. (See here for more pitfalls of using t-SNE.) 
9.1.3 Manifold methods 9.1.3.1 UMAP: Uniform Manifold Approximation and Projection 9.1.3.1.1 Advantages of UMAP over t-SNE: faster deterministic better at preserving clusters 9.1.3.1.2 High level description Construct a topological representation of the high-dimensional data (in this case a weighted \\(k\\)-NN graph) Given low-dimensional data, construct a graph in a similar way Minimize the dissimilarity between the two graphs. (Look for the low-dimensional data whose graph is the closest to that of the high-dimensional data) 9.1.3.1.3 Some details How is the weighted graph built? Obtain dissimilarity from the input distance: For each data point \\(x_i\\), find its \\(k\\) nearest neighbours: \\(x_{i_1}, \\dots, x_{i_k}\\). Let \\(d(x_i, x_{i_j})\\) be the input or original distance between \\(x_i\\) and \\(x_{i_j}\\), and \\(\\rho_i = \\min[d(x_i, x_{i_j}); 1 \\leq j \\leq k]\\) be the distance between \\(x_i\\) and its nearest neighbour. Then the dissimilarity between \\(x_i\\) and \\(x_{i_j}\\) is measured simply by subtracting \\(\\rho_i\\) from the original distance: \\(\\tilde{d}(x_i, x_{i_j}) = d(x_i, x_{i_j}) - \\rho_i\\). Transform dissimilarity to similarity: \\(s(x_i, x_{i_j}) = \\exp[-\\tilde{d}(x_i, x_{i_j})] - c_i\\), where \\(c_i\\) is a scale factor to ensure \\(\\sum_{j = 1}^k s(x_i, x_{i_j})\\) is a constant for all \\(i\\). Similarity itself can serve as edge weights, but this similarity is not symmetrical, i.e. \\(s(x_i, x_{i_j}) \\neq s(x_{i_j}, x_i)\\). To be able to project this onto an undirected graph, we need to resolve the disagreement between \\(s(x_i, x_{i_j})\\) and \\(s(x_{i_j}, x_i)\\). Obtain weights: \\(w(x_i, x_{i_j}) = s(x_i, x_{i_j}) + s(x_{i_j}, x_i) - s(x_i, x_{i_j}) * s(x_{i_j}, x_i)\\) (Interpretation: \\(P(A \\cup B ) = P(A) + P(B) - P(A)P(B)\\) if \\(A\\) and \\(B\\) are independent) How is the dissimilarity between graphs measured? 
Cross entropy 9.1.3.1.4 Example of UMAP muraro <- runUMAP(muraro) plotUMAP(muraro, colour_by="cell_type1") 9.1.3.2 PHATE 9.1.3.2.1 Sketch of algorithm The simpliest description of PHATE: Step1. Create a dissimilarity matrix of the original data Step2. Feed the dissimilarity matrix to nonmetric MDS (MDS: Multi-Dimension Scaling is a classical dimensionality reduction approach, that takes an input of distance matrix, and aims at preserving pairwise distances in the low dimensional space. When the input distance matrix is Euclidean distance, MDS produces the same result as PCA. Nonmetric MDS generalize the input as a dissimilarity matrix, rather than just distance.) Details of step1 in PHATE Step1-1. Markov transition matrix - What is similar with SNE: Recall that in the original SNE algorithm, there is a similarity matrix with entry \\(p_{i|j}\\) that is interpreted as the probability that point \\(x_i\\) chooses \\(x_j\\) as its neighbour: \\(p_{j|i} \\propto \\exp(-\\Vert x_i - x_j \\Vert^2/2\\delta^2)\\). PHATE is doing the same, except that we can interpret it differently: i. We can think \\(p_{j|i}\\) as a Gaussian kernel, where \\(\\epsilon \\triangleq 2\\delta^2\\) is the bandwidth: \\(p_{j|i} \\triangleq K_\\epsilon(x_i, x_j )\\). Similar to SNE, PHATE also define \\(\\epsilon\\) as the \\(k\\)-NN distance of each data point, so that it is smaller in dense area and larger in sparse area. The \\(k\\) is a user-specified tuning parameter, similar to perplexity in SNE. ii. We can think of the similarity matrix as a transition matrix, where \\(p_{j|i}\\) represents the probability of jumping from state \\(i\\) to state \\(j\\) in a single step. - What is different: i. PHATE generalize \\(K_\\epsilon(x_i, x_j)\\) to \\(\\exp \\left(- \\Vert x_i - x_j \\Vert^\\alpha /\\epsilon(x_i)^\\alpha\\right)\\), where the original Gaussian kernel is the special case when \\(\\alpha = 2\\). 
The motivation is that if the data is very sparse in some regions, then the bandwidth \\(\\epsilon\\) will be very large and the kernel will become flat and lose the local information. By letting \\(\\alpha > 2\\), we prevent this from happening, although \\(\\alpha\\) needs to be provided by the user. ii. Note that the kernels are not symmetrical now, that is \\(K_\\epsilon(x_i, x_j) \\neq K_\\epsilon(x_j, x_i)\\). So we make it symmetrical by taking an average of the two. Step1-2. Smoothing - \\(P\\) is the transition matrix where \\(p_{i, j}\\) represents the probability of jumping from state \\(i\\) to state \\(j\\) in a single step. - Denote \\(\\delta_x\\) as a row vector of length \\(n\\) (the number of data points), where only the entry corresponding to \\(x\\) is 1 and it is zero everywhere else. Then \\(p_x = \\delta_x P\\) is the probability distribution of the data points starting from \\(x\\) after one step, and \\(p_x^t = \\delta_x P^t\\) is the probability distribution of the data points after \\(t\\) steps. In general, the more steps we take, the more data points will have positive probabilities. One way to think about this is that the larger \\(t\\) is, the more global information and the less local information is encoded. In the extreme case, if we take infinitely many steps, \\(p_x^\\infty\\) will be the same for all \\(x\\)'s, i.e. the probability distribution is going to be the same regardless of where we start; in this case, the local information is completely lost. An appropriately chosen \\(t\\) is crucial for the balance between local and global information in the embedding. (See the original paper for details of choosing \\(t\\)) Step1-3. 
Distance measurement Instead of directly measuring the Euclidean distance between data points, say \\(x_i\\) and \\(x_j\\), PHATE measures the distance between probability distributions \\(p_{x_i}^t\\) and \\(p_{x_j}^t\\): \\(D^t(x_i, x_j) = \\Vert \\log(p_{x_i}^t) - \\log(p_{x_j}^t) \\Vert^2\\) 9.1.3.2.2 Example of PHATE 9.2 Matrix factorization and factor analysis Factor Analysis is similar to PCA in that they both aim to obtain a new set of distinct summary variables, which are fewer in number than the original number of variables. The key concept of factor analysis is that the original, observed variables are correlated because they are all associated with some unobservable variables, called latent factors. The variance of a variable can be split into two parts: - Common variance: the part of variance that is explained by latent factors; - Unique variance: the part that is specific to only one variable, usually considered as an error component or residual. 9.3 Autoencoders 9.3.1 Background and some notations Data: \\(X\\) Latent variables: \\(Z\\) Something that is not directly observable but is assumed to have an impact on the observed variables. Goal: We believe \\(X\\) can be generated from \\(Z\\) (with some transformation), and want to sample more data from \\(Z\\) that resembles \\(X\\). So we want to find the parameters \\(\\theta\\) such that the probability to generate \\(X\\) from the distribution of \\(Z\\): \\(P(X) = \\int P(X|z; \\theta) P(z) dz\\) is maximized. How do we define \\(Z\\)? -The simplest idea: \\(Z \\sim N(0, 1)\\). It is not impossible, because "any distribution in d dimensions can be generated by taking a set of d variables that are normally distributed and mapping them through a sufficiently complicated function." -A better idea: For most of \\(z\\), \\(P(X|z; \\theta)\\) will be close to zero, meaning it contributes almost nothing to the estimate of \\(P(X)\\). 
Thus, we want to sample only those values of \\(Z\\) that are likely to produce \\(X\\). Denote this distribution of \\(Z\\) as \\(Q(Z|X)\\) (it is infered and therefore depend on \\(X\\)). Advantage: There will be a lot less possible values of \\(Z\\) under \\(Q\\) compared to random sampling, therefore, it will be easier to compute \\(E_{Z \\sim Q} P(X|Z)\\). 9.3.2 Objective \\[ \\log P(X) - KL[Q(Z|X)\\Vert P(Z|X)] = E_{Z\\sim Q}[\\log P(X|Z)] - KL[Q(Z|X)\\Vert P(Z)]\\] We can get this equation by starting from the definition of Kullback-Leibler divergence, combined with the Bayesian formula and a little algebra. (Not showing details here) LHS: what we want to maximize: Generation loss: how likely the generated samples resembles \\(X\\) - an error term which measures how much information is lost when using \\(Q\\) to represent \\(P\\), it becomes small if \\(Q\\) is high-capacity. (A loose explanation of model capacity: Roughly speaking, the capacity of a model describes how complex a relationship it can model. You could expect a model with higher capacity to be able to model more relationships between more variables than a model with a lower capacity.) RHS: what we can maximize through stochastic gradient descent. 9.4 Interpretable latent spaces 9.4.1 Slalom "], -["clustering-and-cell-annotation.html", "10 Clustering and cell annotation 10.1 Clustering Methods 10.2 Clustering example 10.3 An alternative to clustering: Automatic cell annotation", " 10 Clustering and cell annotation 10.1 Clustering Methods Once we have normalized the data and removed confounders we can carry out analyses that are relevant to the biological questions at hand. The exact nature of the analysis depends on the dataset. Nevertheless, there are a few aspects that are useful in a wide range of contexts and we will be discussing some of them in the next few chapters. We will start with the clustering of scRNA-seq data. 
10.1.1 Introduction One of the most promising applications of scRNA-seq is de novo discovery and annotation of cell-types based on transcription profiles. Computationally, this is a hard problem as it amounts to unsupervised clustering. That is, we need to identify groups of cells based on the similarities of the transcriptomes without any prior knowledge of the labels. Moreover, in most situations we do not even know the number of clusters a priori. The problem is made even more challenging due to the high level of noise (both technical and biological) and the large number of dimensions (i.e. genes). When working with large datasets, it can often be beneficial to apply some sort of dimensionality reduction method. By projecting the data onto a lower-dimensional sub-space, one is often able to significantly reduce the amount of noise. An additional benefit is that it is typically much easier to visualize the data in a 2 or 3-dimensional subspace. We have already discussed PCA (chapter 6.6.2) and t-SNE (chapter 6.6.2). Challenges in clustering What is the number of clusters k? What defines a good clustering? What is a cell type? Scalability: in the last few years the number of cells in scRNA-seq experiments has grown by several orders of magnitude from ~\\(10^2\\) to ~\\(10^6\\) 10.1.2 unsupervised Clustering methods Three main ingredients of a complete clustering method: Measure of similarity: how do we quantify how close two data points are? Quality function: how do we decide how “good†is a clustering/partition? Algorithm: how to find the clustering whose quality function is optimized? 10.1.2.1 Hierarchical clustering Hierarchical clustering is basically the only type of clustering algorithm that does not seek to optimize a quality function, because it builds a hierarchy of clusters, instead of one single clustering result as the output. 
There are two types of strategies: - Agglomerative (bottom-up): each observation starts in its own cluster, and pairs of clusters are merged as one moves up the hierarchy. - Divisive (top-down): all observations start in one cluster, and splits are performed recursively as one moves down the hierarchy. 10.1.2.2 k-means clustering Measure of similarity: Euclidean distance Quality function: Within cluster distance Algorithm: Advantage: Fast Drawbacks: - Sensitive to initial clustering - Sensitive to outliers - Need to specify K - Tend to find clusters of similar sizes Tools related to K-means: SC3 10.1.2.3 Graph-based methods Real world networks usually display big inhomogeneities or community structure. Communities or clusters or modules are groups of vertices which probably share common properties and/or play similar roles within the graph. In recent years there has been a lot of interest in detecting communities in networks in various domains. Some of these community detection methods can be applied to scRNA-seq data by building a graph where each vertex represents a cell and (the weight of) the edge measures similarity between two cells. Actually, graph-based clustering is the most popular clustering algorithm in scRNA-seq data analysis, and has been reported to have outperformed other clustering methods in many situations (ref). 10.1.2.3.1 Why do we want to represent the data as a graph? Memory effectiveness: A (complete) graph can be thought of as an alternative expression of the similarity matrix. Current methods (discussed later) aim to build sparse graphs, which ease the memory burden. Curse of dimensionality: All data become sparse in high-dimensional space and therefore similarities measured by Euclidean distances etc are generally low between all objects. 
10.1.2.3.2 Building a graph Step1: Build an unweighted K-nearest neighbour (KNN) graph Step2: Add weights, and obtain a shared nearest neighbour (SNN) graph {width= 4%} There are two ways of adding weights: number and rank. - number: The number of shared nodes between \\(u\\) and \\(v\\), in this case, 3. - rank: A measurement of the closeness to their common nearest neighbours. (ref) Details of rank : Main idea: The closeness of two people is defined by their closest common friend. For each node, say \\(u\\), we can rank its 5 neighbours according to their closeness to \\(u\\), and we can do the same with \\(v\\). Denote the three shared neighbours as \\(x_1\\), \\(x_2\\) and \\(x_3\\), so rank(\\(x_1, u\\)) = 1 means \\(x_1\\) is the closest neighbour of \\(u\\). The idea is, if \\(x_1\\) is also the closest to \\(v\\), then \\(u\\) and \\(v\\) should have a larger similarity, or weight. So we summarize the overall closeness of \\(x_1\\) with both \\(u\\) and \\(v\\) by taking an average: \\(\\dfrac{1}{2}(\\text{rank}(x_1, u), \\text{rank}(x_1, v))\\). Then we find the one with the largest closeness, \\(s(u, v) = \\min \\left[ \\dfrac{1}{2}(\\text{rank}(x_i, u), \\text{rank}(x_i, v)) \\vert i = 1, 2, 3\\right]\\). The final expression of weight: \\[ w(u, v) = K - s(u, v).\\] 10.1.2.3.3 Quality function (Modularity) Modularity is not the only quality function for graph-based clustering, but it is one of the first attempts to embed in a compact form many questions including … . The idea of modularity: A random graph should not have a cluster structure. The more “quality†a partition has compared to a random graph, the “better†the partition is. Specifically, it is defined by: the quality of a partition on the actual graph \\(-\\) the quality of the same partition on a random graph quality : Sum of the weights within clusters random graph : a copy of the original graph, with some of its properties, but without community structure. 
The random graph defined by modularity is: each node has the same degree as the original graph. \\[ Q \\propto \\sum_{i, j} A_{i, j} \\delta(i, j) - \\sum_{i, j} \\dfrac{k_i k_j}{2m} \\delta(i, j)\\] [notations] Higher modularity implies a better partition: Limits of modularity: 1. Resolution limit. Short version: Modularity maximization forces small communities into larger ones. Longer version: For two clusters \\(A\\) and \\(B\\), if \\(k_A k_B < 2m\\) then modularity increases by merging A and B into a single cluster, even if A and B are distinct clusters. 2. Bad, even random partitions may have a high modularity. Networks lack a clear modularity maximum. 10.1.2.3.4 Algorithms : Modularity-based clustering methods implemented in single cell analysis are mostly greedy algorithms that are very fast, although not the most accurate approaches. Louvain: Leiden: Improved Louvain, hybrid of greedy algorithm and sampling technique 10.1.2.3.5 Advantages: -Fast -No need to specify \\(k\\) 10.1.2.3.6 Tools for graph-based clustering: Seurat: Louvain, Leiden, SLM igraph: fast greedy, Louvain, optimal, walktrap, spinglass, infomap 10.1.2.4 Consensus clustering (more robustness, less computational speed) 10.1.2.4.1 Motivation (Two problems of \\(K\\)-means): Problem1: sensitive to initial partitions Solution: Run multiple iterations of \\(K\\)-means on different subsamples of the original dataset, with different initial partitions. Problem2: the selection of \\(K\\). Solution: Run \\(K\\)-means with a range of \\(K\\)'s. 10.1.2.4.2 Algorithm of consensus clustering (simplest version): for(k in the range of K){ for(each subsample of the data){ for(iteration in 1:1000){ kmeans(subsample, k) # each iteration means a different initial partition save partition } } return consensus clustering result of k } 10.1.2.4.3 Subsample obtained by dimensional reduction: steps of PCA: i) transformation of the similarity matrix. 
ii) ranking eigenvectors according to their corresponding eigenvalues in decreasing order. iii) need to decide how many (\\(d\\)) PC's or eigenvalues we want to reduce to. In SC3, i) considers two types of transformation: traditional PCA and the associated graph Laplacian. iii) The user may specify a range of \\(d\\), or use the default range suggested by the authors according to their experience with empirical results. 10.1.2.4.4 Consensus clustering (combining multiple clustering results): Step1: Represent each partition as a matrix: Say we partitioned four data points into 2 clusters. Step2: Consensus matrix: Average of all the partitions 10.1.2.4.5 Tools for consensus clustering: SC3 10.2 Clustering example library(pcaMethods) library(SC3) library(scater) library(SingleCellExperiment) library(pheatmap) library(mclust) library(igraph) library(scran) 10.2.1 Example 1. Graph-based clustering (deng dataset) To illustrate clustering of scRNA-seq data, we consider the Deng dataset of cells from developing mouse embryo (Deng et al. 2014). We have preprocessed the dataset and created a SingleCellExperiment object in advance. We have also annotated the cells with the cell types identified in the original publication (it is the cell_type2 column in the colData slot). deng <- readRDS("data/deng/deng-reads.rds") First, we build a \\(K\\)-NN graph with a package function from scran. The most important decision in building a graph is the choice of \\(K\\), for which there is no standard rule. In general, we can think of it as an indication of the desired cluster size. If \\(K\\) is too small, a genuine cluster might be split into parts, while if \\(K\\) is too large, clusters might not be thoroughly separated. 
deng5 <- buildSNNGraph(deng, k = 5) deng15 <- buildSNNGraph(deng, k = 15) deng25 <- buildSNNGraph(deng, k = 25) par(mfrow=c(1,3)) plot(deng5, vertex.size = 4, vertex.label = NA) title("5-NN" ,line = -33, cex.main = 3) plot(deng15, vertex.size = 4, vertex.label = NA) title("15-NN" ,line = -33, cex.main = 3) plot(deng25, vertex.size = 4, vertex.label = NA) title("25-NN" ,line = -33, cex.main = 3) Perform Louvain clustering: cl <- igraph::cluster_louvain(deng15)$membership colData(deng)$cl <- factor(cl) mclust::adjustedRandIndex(colData(deng)$cell_type1, colData(deng)$cl) ## [1] 0.8248454 Reaches very high similarity with the labels provided in the original paper. However, it tend to merge small clusters into larger ones. table(deng$cell_type1, cl) ## cl ## 1 2 3 ## 16cell 49 0 1 ## 2cell 0 22 0 ## 4cell 0 14 0 ## 8cell 36 0 1 ## blast 0 0 133 ## zygote 0 12 0 10.2.2 Example 2. Graph-based clustering (segerstolpe dataset) muraro <- readRDS("data/pancreas/muraro.rds") ## PCA var.fit <- suppressWarnings(trendVar(muraro, parametric=TRUE, use.spikes=F)) muraro <- suppressWarnings(denoisePCA(muraro, technical=var.fit$trend)) dim(reducedDim(muraro, "PCA")) ## [1] 2126 5 ## Build graph and clustering gr <- buildSNNGraph(muraro, use.dimred="PCA", k = 30) cl <- igraph::cluster_louvain(gr)$membership colData(muraro)$cl <- factor(cl) mclust::adjustedRandIndex(colData(muraro)$cell_type1, colData(muraro)$cl) ## [1] 0.4845618 table(muraro$cell_type1, cl) ## cl ## 1 2 3 4 5 6 7 8 9 ## acinar 0 0 0 0 0 0 218 0 1 ## alpha 202 306 274 5 15 9 1 0 0 ## beta 1 0 0 5 195 21 2 220 4 ## delta 0 0 0 0 18 174 0 1 0 ## ductal 0 0 0 215 0 1 7 3 19 ## endothelial 0 0 0 0 0 0 0 0 21 ## epsilon 0 0 0 0 0 3 0 0 0 ## gamma 1 0 1 0 0 97 2 0 0 ## mesenchymal 0 0 0 1 0 0 0 0 79 ## unclear 0 0 0 4 0 0 0 0 0 10.2.3 Example 3. SC3 Let’s run SC3 clustering on the Deng data. The advantage of the SC3 is that it can directly ingest a SingleCellExperiment object. 
Now let's imagine we do not know the number of clusters k (cell types). SC3 can estimate a number of clusters for you: deng <- sc3_estimate_k(deng) metadata(deng)$sc3$k_estimation Interestingly, the number of cell types predicted by SC3 is smaller than in the original data annotation. However, if we merge the early, mid and late stages of the different cell types together, we will have exactly 6 cell types. We store the merged cell types in the cell_type1 column of the colData slot: plotPCA(deng, colour_by = "cell_type1") Now we are ready to run SC3 (we also ask it to calculate biological properties of the clusters): deng <- sc3(deng, ks = 10, biology = TRUE, n_cores = 1) SC3 result consists of several different outputs (please look in (Kiselev et al. 2017) and SC3 vignette for more details). Here we show some of them: Consensus matrix: sc3_plot_consensus(deng, k = 10, show_pdata = "cell_type2") Silhouette plot: sc3_plot_silhouette(deng, k = 10) Heatmap of the expression matrix: sc3_plot_expression(deng, k = 10, show_pdata = "cell_type2") Identified marker genes: sc3_plot_markers(deng, k = 10, show_pdata = "cell_type2") PCA plot with highlighted SC3 clusters: plotPCA(deng, colour_by = "sc3_10_clusters") Compare the results of SC3 clustering with the original publication cell type labels: adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$sc3_10_clusters) Note SC3 can also be run in an interactive Shiny session: sc3_interactive(deng) This command will open SC3 in a web browser. Note Due to direct calculation of distances SC3 becomes very slow when the number of cells is \\(>5000\\). For large datasets containing up to \\(10^5\\) cells we recommend using Seurat (see chapter 16). 10.3 An alternative to clustering: Automatic cell annotation 10.3.1 SingleR 10.3.1.1 Methodology Step1. Find variable genes 1-1. For every gene, obtain the median grouped by label. 1-2. Select genes that make at least one label different: 
For example, if we are looking for the genes that make label "green" different from label "red", we subtract the first column from the second, and pick the top \\(N\\) highest and positive values. All analyses from here onwards use only the selected variable genes. Step2. Spearman's correlation Spearman's correlation \\(\\in [-1, 1]\\) is a measure of the strength of a linear or monotonic relationship between paired data. We compute the Spearman's correlation for all pairs of cells in the test and reference dataset, and obtain an \\(n_{\\text{test}} \\times n_{\\text{ref}}\\) correlation matrix, where \\(n\\) is the number of cells (see the first matrix in Step3). Step3. Scoring We want to know how each cell in the test data is correlated to the labels in the reference data, instead of each reference cell. So we take the correlations of a cell in the test data with all the cells with a certain label in the reference data, and summarize them into one number, or a score; in SingleR, the default is to take the \\(80\\%\\) quantile. Step4. Fine tuning We could stop here and assign each cell the label with the highest score; actually, if we set the argument fine.tune = FALSE, that is exactly what the package function SingleR does. But there is one more question: what if the second highest score is very close to the highest? 10.3.1.2 Example (Note: SingleR is not yet available in the released version of Bioconductor. It will be possible to run it as shown once the next Bioconductor release is made in late October.) 
library(scRNAseq) library(SingleR) segerstolpe <- readRDS("data/pancreas/segerstolpe.rds") sceM <- suppressMessages(MuraroPancreasData()) sceM <- sceM[,!is.na(sceM$label)] sceM <- logNormCounts(sceM) ## find common gene rownames(sceM) <- gsub("__.*","",rownames(sceM)) common <- intersect(rownames(sceM), rownames(segerstolpe)) sceM <- sceM[common,] segerstolpe <- segerstolpe[common,] ## Prepare reference out <- pairwiseTTests(logcounts(sceM), sceM$label, direction="up") markers <- getTopMarkers(out$statistics, out$pairs, n=10) ## Annotation pred <- SingleR(test=segerstolpe, ref=sceM, labels=sceM$label, genes=markers) ## View result plotScoreHeatmap(pred, show.labels = TRUE, annotation_col=data.frame( row.names=rownames(pred))) 10.3.2 scmap ## Load data segerstolpe <- readRDS("data/pancreas/segerstolpe.rds") # test library(scRNAseq) sceM <- readRDS("data/pancreas/muraro.rds") # reference rownames(sceM) <- gsub("__.*","",rownames(sceM)) Select the most informative features (genes) using the dropout feature selection method. By default it selects 500 features. library(scmap) rowData(sceM)$feature_symbol <- rownames(sceM) sceM <- selectFeatures(sceM, suppress_plot = TRUE) The index of a reference dataset is created by finding the median gene expression for each cluster. First, chop the total of 500 features into \\(M = 50\\) chunks / low-dimensional subspaces. Second, cluster each chunk into \\(k = \\sqrt{N}\\) clusters, where \\(N\\) is the number of cells. By default scmap uses the cell_type1 column of the colData slot in the reference to identify clusters. sceM <- indexCell(sceM) The function indexCell writes the scmap_cell_index item of the metadata slot of the reference dataset sceM. 
This step has two outputs: names(metadata(sceM)$scmap_cell_index) ## [1] "subcentroids" "subclusters" subcentroids returns cluster centers: cat(length(metadata(sceM)$scmap_cell_index$subcentroids), " chunks \\n") ## 50 chunks cat("The dimension of cluster centers in each chunk: ", dim(metadata(sceM)$scmap_cell_index$subcentroids[[1]]), "\\n") ## The dimension of cluster centers in each chunk: 10 46 subclusters contains information about which cluster (label) the cells belong to dim(metadata(sceM)$scmap_cell_index$subclusters) ## [1] 50 2126 metadata(sceM)$scmap_cell_index$subclusters[1:5,1:5] ## D28.1_1 D28.1_13 D28.1_15 D28.1_17 D28.1_2 ## [1,] 13 25 36 1 29 ## [2,] 7 24 19 17 21 ## [3,] 19 35 7 7 36 ## [4,] 38 27 29 38 41 ## [5,] 8 39 24 40 1 Projection: Once the scmap-cell indexes have been generated we can use them to project the test dataset. scmapCell_results <- scmapCell( projection = segerstolpe, index_list = list( sceM = metadata(sceM)$scmap_cell_index ) ) names(scmapCell_results) ## [1] "sceM" The cells matrix contains the top 10 (scmap default) cell IDs of the cells of the reference dataset that a given cell of the projection dataset is closest to: dim(scmapCell_results$sceM$cells) ## [1] 10 3514 Cell annotation: If cell cluster annotation is available for the reference datasets, scmap-cell can also annotate the cells from the projection dataset using the labels of the reference. It does so by looking at the top 3 nearest neighbours (scmap default) and if they all belong to the same cluster in the reference and their maximum similarity is higher than a threshold (0.5 is the scmap default), then a projection cell is assigned to the corresponding reference cluster: scmapCell_clusters <- scmapCell2Cluster( scmapCell_results, list( colData(sceM)$cell_type1 )) Plot result Compare the annotated result with the original label in the segerstolpe dataset. 
plot( getSankey( segerstolpe$cell_type1, scmapCell_clusters$combined_labs, plot_height = 400 ) ) 10.3.3 sessionInfo() Among the 2126 cells in the data, only 89 are annotated as different labels as the References "], -["trajectory-inference.html", "11 Trajectory inference 11.1 First look at Deng data", " 11 Trajectory inference library(SingleCellExperiment) library(TSCAN) library(M3Drop) library(monocle) library(destiny) library(scater) library(ggplot2) library(ggthemes) library(ggbeeswarm) library(corrplot) library(Polychrome) library(slingshot) library(SLICER) library(ouija) set.seed(1) In many situations, one is studying a process where cells change continuously. This includes, for example, many differentiation processes taking place during development: following a stimulus, cells will change from one cell-type to another. Ideally, we would like to monitor the expression levels of an individual cell over time. Unfortunately, such monitoring is not possible with scRNA-seq since the cell is lysed (destroyed) when the RNA is extracted. Instead, we must sample at multiple time-points and obtain snapshots of the gene expression profiles. Since some of the cells will proceed faster along the differentiation than others, each snapshot may contain cells at varying points along the developmental progression. We use statistical methods to order the cells along one or more trajectories which represent the underlying developmental trajectories, this ordering is referred to as “pseudotimeâ€. In this chapter we will consider five different tools: TSCAN,Slingshot,Monocle and some off-the-shelf methods like PCA, for ordering cells according to their pseudotime development. To illustrate the methods we will be using a dataset on mouse embryonic development (Deng et al. 2014). The dataset consists of 268 cells from 10 different time-points of early mouse development. 
In this case, there is no need for pseudotime alignment since the cell labels provide information about the development trajectory. Thus, the labels allow us to establish a ground truth so that we can evaluate and compare the different methods. A recent benchmarking paper by Saelens et al (Saelens et al. 2019) provides a detailed summary of the various computational methods for trajectory inference from single-cell transcriptomics (Saelens et al. 2019). They discuss 45 tools and evaluate them across various aspects including accuracy, scalability, and usability. The following figures from the paper summarise several key aspects and some of the features of the tools being evaluated: Figure 2.3: Overview of several key aspects of the evaluation (Fig. 1 from Saelens et al, 2019). The characteristics of the 45 TI tools: Figure 2.4: Characterization of trajectory inference methods for single-cell transcriptomics data (Fig. 2 from Saelens et al, 2019). The detailed evaluation results of the 45 TI tools: Figure 2.5: Detailed results of the four main evaluation criteria: accuracy, scalability, stability and usability of trajectory inference methods for single-cell transcriptomics data (Fig. 3 from Saelens et al, 2019). 11.1 First look at Deng data Let us take a first look at the Deng (Deng et al. 2014) data, without yet applying sophisticated pseudotime methods. As the plot below shows, simple PCA does a very good job of displaying the structure in these data. It is only once we reach the blast cell types ("earlyblast", "midblast", "lateblast") that PCA struggles to separate the distinct cell types. 
deng_SCE <- readRDS("data/deng/deng-reads.rds") deng_SCE$cell_type2 <- factor( deng_SCE$cell_type2, levels = c("zy", "early2cell", "mid2cell", "late2cell", "4cell", "8cell", "16cell", "earlyblast", "midblast", "lateblast") ) cellLabels <- deng_SCE$cell_type2 deng <- counts(deng_SCE) colnames(deng) <- cellLabels deng_SCE <- scater::runPCA(deng_SCE,ncomponent = 5) ## change color Palette with library(Polychrome) set.seed(723451) # for reproducibility my_color <- createPalette(10, c("#010101", "#ff0000"), M=1000) names(my_color) <- unique(as.character(deng_SCE$cell_type2)) pca_df <- data.frame(PC1 = reducedDim(deng_SCE,"PCA")[,1], PC2 = reducedDim(deng_SCE,"PCA")[,2], cell_type2 = deng_SCE$cell_type2) ggplot(data = pca_df)+geom_point(mapping = aes(x = PC1, y = PC2, colour = cell_type2))+ scale_colour_manual(values = my_color)+theme_classic() PCA, here, provides a useful baseline for assessing different pseudotime methods. For a very naive pseudotime we can just take the co-ordinates of the first principal component. #deng_SCE$PC1 <- reducedDim(deng_SCE, "PCA")[,1] ggplot(pca_df, aes(x = PC1, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_colour_manual(values = my_color) + theme_classic() + xlab("First principal component") + ylab("Timepoint") + ggtitle("Cells ordered by first principal component") As the plot above shows, PC1 struggles to correctly order cells early and late in the developmental timecourse, but overall does a relatively good job of ordering cells by developmental time. Can bespoke pseudotime methods do better than naive application of PCA? 11.1.1 TSCAN TSCAN (Ji and Ji 2019) combines clustering with pseudotime analysis. First it clusters the cells using mclust, which is based on a mixture of normal distributions. Then it builds a minimum spanning tree to connect the clusters. The branch of this tree that connects the largest number of clusters is the main branch which is used to determine pseudotime. 
Note From a connected graph with weighted edges, the MST is the tree structure that connects all the nodes in a way that has the minimum total edge weight. Trajectory inference methods that use an MST are based on the idea that the nodes (cells/clusters of cells) and their connections represent the geometric shape of the data cloud in a two-dimensional space. First we will try to use all genes to order the cells. procdeng <- TSCAN::preprocess(counts(deng_SCE)) colnames(procdeng) <- 1:ncol(deng_SCE) dengclust <- TSCAN::exprmclust(procdeng, clusternum = 10) TSCAN::plotmclust(dengclust) dengorderTSCAN <- TSCAN::TSCANorder(dengclust, orderonly = FALSE) pseudotime_order_tscan <- as.character(dengorderTSCAN$sample_name) deng_SCE$pseudotime_order_tscan <- NA deng_SCE$pseudotime_order_tscan[as.numeric(dengorderTSCAN$sample_name)] <- dengorderTSCAN$Pseudotime Frustratingly, TSCAN only provides pseudotime values for 221 of 268 cells, silently returning missing values for non-assigned cells. Again, we examine which timepoints have been assigned to each state: cellLabels[dengclust$clusterid == 10] ## [1] late2cell late2cell late2cell late2cell late2cell late2cell late2cell ## [8] late2cell late2cell late2cell ## 10 Levels: zy early2cell mid2cell late2cell 4cell 8cell ... lateblast ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_order_tscan, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab("TSCAN pseudotime") + ylab("Timepoint") + ggtitle("Cells ordered by TSCAN pseudotime") TSCAN gets the development trajectory the "wrong way around", in the sense that later pseudotime values correspond to early timepoints and vice versa. This is not inherently a problem (it is easy enough to reverse the ordering to get the intuitive interpretation of pseudotime), but overall it would be a stretch to suggest that TSCAN performs better than PCA on this dataset. 
(As it is a PCA-based method, perhaps this is not entirely surprising.) Exercise 1 Compare results for different numbers of clusters (clusternum). 11.1.2 Slingshot Slingshot (Street et al. 2018) is a single-cell lineage inference tool; it can work with datasets with multiple branches. Slingshot has two stages: 1) the inference of the global lineage structure using an MST on clustered data points and 2) the inference of pseudotime variables for cells along each lineage by fitting simultaneous ‘principal curves’ across multiple lineages. Slingshot’s first stage uses a cluster-based MST to stably identify the key elements of the global lineage structure, i.e., the number of lineages and where they branch. This allows us to identify novel lineages while also accommodating the use of domain-specific knowledge to supervise parts of the tree (e.g., terminal cellular states). For the second stage, we propose a novel method called simultaneous principal curves, to fit smooth branching curves to these lineages, thereby translating the knowledge of global lineage structure into stable estimates of the underlying cell-level pseudotime variable for each lineage. Slingshot has performed consistently well across different datasets, as reported by Saelens et al., so let’s run it on the deng dataset. Slingshot recommends running on a reduced-dimensional representation. Note Principal curves are smooth one-dimensional curves that pass through the middle of a p-dimensional data set, providing a nonlinear summary of the data. They are nonparametric, and their shape is suggested by the data (Hastie et al.) (Hastie and Stuetzle 1989). ## running slingshot deng_SCE <- slingshot(deng_SCE, clusterLabels = 'cell_type2',reducedDim = "PCA", allow.breaks = FALSE) ## Using diagonal covariance matrix summary(deng_SCE$slingPseudotime_1) ## Min. 1st Qu. Median Mean 3rd Qu. Max. 
NA's ## 0.00 52.19 59.81 60.34 81.60 85.72 55 ## get lineages inferred by slingshot lnes <- getLineages(reducedDim(deng_SCE,"PCA"), deng_SCE$cell_type2) ## Using diagonal covariance matrix lnes@lineages ## $Lineage1 ## [1] "zy" "early2cell" "mid2cell" "late2cell" "4cell" ## [6] "16cell" "midblast" "earlyblast" ## ## $Lineage2 ## [1] "zy" "early2cell" "mid2cell" "late2cell" "4cell" ## [6] "16cell" "midblast" "lateblast" ## ## $Lineage3 ## [1] "zy" "early2cell" "mid2cell" "late2cell" "4cell" ## [6] "16cell" "8cell" ## plot the lineage overlay on the orginal PCA plot plot(reducedDims(deng_SCE)$PCA, col = my_color[as.character(deng_SCE$cell_type2)], pch=16, asp = 1) legend("bottomleft",legend = names(my_color[levels(deng_SCE$cell_type2)]), fill = my_color[levels(deng_SCE$cell_type2)]) lines(SlingshotDataSet(deng_SCE), lwd=2, type = 'lineages', col = c("black")) ## Plotting the pseudotime inferred by slingshot by cell types slingshot_df <- data.frame(colData(deng_SCE)) ggplot(slingshot_df, aes(x = slingPseudotime_1, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + theme_classic() + xlab("First Slingshot pseudotime") + ylab("cell type") + ggtitle("Cells ordered by Slingshot pseudotime")+scale_colour_manual(values = my_color) ggplot(slingshot_df, aes(x = slingPseudotime_2, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + theme_classic() + xlab("Second Slingshot pseudotime") + ylab("cell type") + ggtitle("Cells ordered by Slingshot pseudotime")+scale_colour_manual(values = my_color) ggplot(slingshot_df, aes(x = slingPseudotime_1, y = slingPseudotime_2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + theme_classic() + xlab("First Slingshot pseudotime") + ylab("Second Slingshot pseudotime") + ggtitle("Cells ordered by Slingshot pseudotime")+scale_colour_manual(values = my_color) # # ggplot(slingshot_df, aes(x = slingPseudotime_1, y = slingPseudotime_2, # colour = slingPseudotime_3)) + # geom_point() + 
theme_classic() + # xlab("First Slingshot pseudotime") + ylab("Second Slingshot pseudotime") + # ggtitle("Cells ordered by Slingshot pseudotime")+facet_wrap(.~cell_type2) Note You can also supply a start and an end cluster to slingshot. Comments Did you notice the ordering of clusters in the lineage predicted for the 16cell state? There is an outlier-like cell in the 16cell group; find the outlier and remove it, then re-run Slingshot. 11.1.3 Monocle The original Monocle (Trapnell et al. 2014) method skips the clustering stage of TSCAN and directly builds a minimum spanning tree on a reduced dimension representation (using ‘ICA’) of the cells to connect all cells. Monocle then identifies the longest path in this tree as the main branch and uses this to determine pseudotime. Priors are required such as start/end state and the number of branching events. If the data contains diverging trajectories (i.e. one cell type differentiates into two different cell-types), monocle can identify these. Each of the resulting forked paths is defined as a separate cell state. 11.1.4 Monocle 2 Monocle 2 (Qiu et al. 2017) uses a different approach, with dimensionality reduction and ordering performed by reverse graph embedding (RGE), allowing it to detect branching events in an unsupervised manner. RGE, a machine-learning strategy, learns a ‘principal graph’ to describe the single-cell dataset. RGE also learns the mapping function of data points on the trajectory back to the original high-dimensional space simultaneously. In doing so, it aims to position the latent points in the lower dimension space (along the trajectory) while also ensuring their corresponding positions in the input dimension are ‘neighbors’. There are different ways of implementing the RGE framework; Monocle 2 uses DDRTree (Discriminative dimensionality reduction via learning a tree) by default. 
DDRTree learns latent points and the projection of latent points to the points in the original input space, which is equivalent to "dimension reduction". In addition, it simultaneously learns a ‘principal graph’ for the K-means soft-clustered centroids of the latent points. The principal graph is the spanning tree of those centroids. DDRTree returns a principal tree of the centroids of cell clusters in low dimension; pseudotime is derived for individual cells by calculating the geodesic distance of their projections onto the tree from the root (user-defined or arbitrarily assigned). Note Informally, a principal graph is like a principal curve which passes through the ‘middle’ of a data set but is allowed to have branches. library(monocle) #d <- deng_SCE[m3dGenes,] ## feature selection deng <- counts(deng_SCE) m3dGenes <- as.character( M3DropFeatureSelection(deng)$Gene ) d <- deng_SCE[which(rownames(deng_SCE) %in% m3dGenes), ] d <- d[!duplicated(rownames(d)), ] colnames(d) <- 1:ncol(d) geneNames <- rownames(d) rownames(d) <- 1:nrow(d) pd <- data.frame(timepoint = cellLabels) pd <- new("AnnotatedDataFrame", data=pd) fd <- data.frame(gene_short_name = geneNames) fd <- new("AnnotatedDataFrame", data=fd) dCellData <- newCellDataSet(counts(d), phenoData = pd, featureData = fd) # dCellData <- setOrderingFilter(dCellData, which(geneNames %in% m3dGenes)) dCellData <- estimateSizeFactors(dCellData) dCellDataSet <- reduceDimension(dCellData,reduction_method = "DDRTree", pseudo_expr = 1) dCellDataSet <- orderCells(dCellDataSet, reverse = FALSE) plot_cell_trajectory(dCellDataSet) # Store the ordering pseudotime_monocle2 <- data.frame( Timepoint = phenoData(dCellDataSet)$timepoint, pseudotime = phenoData(dCellDataSet)$Pseudotime, State = phenoData(dCellDataSet)$State ) rownames(pseudotime_monocle2) <- 1:ncol(d) pseudotime_order_monocle <- rownames(pseudotime_monocle2[order(pseudotime_monocle2$pseudotime), ]) Note: check ?reduceDimension for other available methods. We can again compare the 
inferred pseudotime to the known sampling timepoints. deng_SCE$pseudotime_monocle2 <- pseudotime_monocle2$pseudotime ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_monocle2, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab("monocle2 pseudotime") + ylab("Timepoint") + ggtitle("Cells ordered by monocle2 pseudotime") Monocle 2 performs pretty well on these cells. 11.1.5 Monocle 3 Monocle3 (Cao et al. 2019) is the updated single-cell analysis toolkit for analysing large datasets. Monocle 3 is designed for use with absolute transcript counts (e.g. from UMI experiments). It first does dimension reduction with UMAP, then clusters the cells with Louvain/Leiden algorithms and merges adjacent groups into supergroups, and finally resolves the trajectories individual cells can take during development, identifying the locations of branches and convergences within each supergroup. In short, Monocle3 uses UMAP to construct an initial trajectory and refines it by learning a principal graph. It builds a KNN graph in the UMAP dimensions and runs Louvain/Leiden algorithms on the KNN graph to derive communities; edges are drawn to connect communities that have more links (Partitioned Approximate Graph Abstraction (PAGA) graph). Each component of the PAGA graph is passed to the next step, which learns a principal graph based on the SimplePPT algorithm. The pseudotime is calculated for individual cells by projecting the cells to their nearest point on the principal graph edge and measuring the geodesic distance along the principal points to the closest root node. 
library(monocle3) ## ## Attaching package: 'monocle3' ## The following objects are masked from 'package:monocle': ## ## plot_genes_in_pseudotime, plot_genes_violin, ## plot_pc_variance_explained ## The following objects are masked from 'package:Biobase': ## ## exprs, fData, fData<-, pData, pData<- gene_meta <- rowData(deng_SCE) #gene_metadata must contain a column verbatim named 'gene_short_name' for certain functions. gene_meta$gene_short_name <- rownames(gene_meta) cds <- new_cell_data_set(expression_data = counts(deng_SCE), cell_metadata = colData(deng_SCE), gene_metadata = gene_meta) ## Step 1: Normalize and pre-process the data cds <- preprocess_cds(cds,num_dim = 5) plot_pc_variance_explained(cds) ## Step 3: Reduce the dimensions using UMAP cds <- reduce_dimension(cds) ## No preprocess_method specified, using preprocess_method = 'PCA' ## Step 4: Cluster the cells cds <- cluster_cells(cds) ## change the clusters ## cds@clusters$UMAP$clusters <- deng_SCE$cell_type2 ## Step 5: Learn a graph cds <- learn_graph(cds,use_partition = TRUE) ## Step 6: Order cells cds <- order_cells(cds, root_cells = c("zy","zy.1","zy.2","zy.3") ) plot_cells(cds, color_cells_by="cell_type2", graph_label_size = 4, cell_size = 2, group_label_size = 6)+ scale_color_manual(values = my_color) plot_cells(cds, graph_label_size = 6, cell_size = 1, color_cells_by="pseudotime", group_label_size = 6) ## Cells aren't colored in a way that allows them to be grouped. pdata_cds <- pData(cds) pdata_cds$pseudotime_monocle3 <- monocle3::pseudotime(cds) ggplot(as.data.frame(pdata_cds), aes(x = pseudotime_monocle3, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab("monocle3 pseudotime") + ylab("Timepoint") + ggtitle("Cells ordered by monocle3 pseudotime") deng_SCE$pseudotime_monocle3 <- pdata_cds$pseudotime_monocle3 It did not work well for our small Smart-seq2 dataset. 
11.1.6 Diffusion maps Diffusion maps were introduced by Ronald Coifman and Stephane Lafon (Coifman and Lafon 2006), and the underlying idea is to assume that the data are samples from a diffusion process. The method infers the low-dimensional manifold by estimating the eigenvalues and eigenvectors for the diffusion operator related to the data. Angerer et al. (Angerer et al. 2016) have applied the diffusion maps concept to the analysis of single-cell RNA-seq data to create an R package called destiny. We will take the rank order of cells in the first diffusion map component as "diffusion map pseudotime" here. deng <- logcounts(deng_SCE) colnames(deng) <- cellLabels dm <- DiffusionMap(t(deng)) tmp <- data.frame(DC1 = eigenvectors(dm)[,1], DC2 = eigenvectors(dm)[,2], Timepoint = deng_SCE$cell_type2) ggplot(tmp, aes(x = DC1, y = DC2, colour = Timepoint)) + geom_point() + scale_color_manual(values = my_color) + xlab("Diffusion component 1") + ylab("Diffusion component 2") + theme_classic() deng_SCE$pseudotime_diffusionmap <- rank(eigenvectors(dm)[,1]) ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_diffusionmap, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab("Diffusion map pseudotime (first diffusion map component)") + ylab("Timepoint") + ggtitle("Cells ordered by diffusion map pseudotime") Like the other methods, using the first diffusion map component from destiny as pseudotime does a good job at ordering the early time-points (if we take high values as "earlier" in development), but it is unable to distinguish the later ones. Exercise 2 Do you get a better resolution between the later time points by considering additional eigenvectors? Exercise 3 How does the ordering change if you only use the genes identified by M3Drop? 
11.1.7 Other methods 11.1.7.1 SLICER The SLICER (Welch, Hartemink, and Prins 2016) method is an algorithm for constructing trajectories that describe gene expression changes during a sequential biological process, just as Monocle and TSCAN are. SLICER is designed to capture highly nonlinear gene expression changes, automatically select genes related to the process, and detect multiple branch and loop features in the trajectory (Welch, Hartemink, and Prins 2016). The SLICER R package is available from its GitHub repository and can be installed from there using the devtools package. We use the select_genes function in SLICER to automatically select the genes to use in building the cell trajectory. The function uses "neighbourhood variance" to identify genes that vary smoothly, rather than fluctuating randomly, across the set of cells. Following this, we determine which value of "k" (number of nearest neighbours) yields an embedding that most resembles a trajectory. Then we estimate the locally linear embedding of the cells. 
library("lle") slicer_genes <- select_genes(t(deng)) k <- select_k(t(deng[slicer_genes,]), kmin = 30, kmax=60) ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates slicer_traj_lle <- lle(t(deng[slicer_genes,]), m = 2, k)$Y ## finding neighbours ## calculating weights ## computing coordinates reducedDim(deng_SCE, "LLE") <- slicer_traj_lle plot_df <- data.frame(slicer1 = reducedDim(deng_SCE, "LLE")[,1], slicer2 = reducedDim(deng_SCE, "LLE")[,2], cell_type2 = deng_SCE$cell_type2) ggplot(data = plot_df)+geom_point(mapping = aes(x = slicer1, y = slicer2, color = cell_type2))+ scale_color_manual(values = my_color)+ xlab("LLE component 1") + ylab("LLE component 2") + ggtitle("Locally linear embedding of cells from SLICER")+ theme_classic() With the locally linear embedding computed we can construct a k-nearest neighbour graph that is fully connected. This plot displays a (yellow) circle for each cell, with the cell ID number overlaid in blue. Here we show the graph computed using 10 nearest neighbours. Here, SLICER appears to detect one major trajectory with one branch. slicer_traj_graph <- conn_knn_graph(slicer_traj_lle, 10) plot(slicer_traj_graph, main = "Fully connected kNN graph from SLICER") From this graph we can identify “extreme†cells that are candidates for start/end cells in the trajectory. ends <- find_extreme_cells(slicer_traj_graph, slicer_traj_lle) start <- ends[1] Having defined a start cell we can order the cells in the estimated pseudotime. 
pseudotime_order_slicer <- cell_order(slicer_traj_graph, start) branches <- assign_branches(slicer_traj_graph, start) pseudotime_slicer <- data.frame( Timepoint = cellLabels, pseudotime = NA, State = branches ) pseudotime_slicer$pseudotime[pseudotime_order_slicer] <- 1:length(pseudotime_order_slicer) deng_SCE$pseudotime_slicer <- pseudotime_slicer$pseudotime We can again compare the inferred pseudotime to the known sampling timepoints. SLICER does not provide a pseudotime value per se, just an ordering of cells. ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_slicer, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab("SLICER pseudotime (cell ordering)") + ylab("Timepoint") + theme_classic() Like the previous method, SLICER (Welch, Hartemink, and Prins 2016) here provides a good ordering for the early time points. It places “16cell†cells before “8cell†cells, but provides better ordering for blast cells than many of the earlier methods. Exercise 4 How do the results change for different k? (e.g. k = 5) What about changing the number of nearest neighbours in the call to conn_knn_graph? Exercise 5 How does the ordering change if you use a different set of genes from those chosen by SLICER (e.g. the genes identified by M3Drop)? 11.1.7.2 Ouija Ouija (http://kieranrcampbell.github.io/ouija/) takes a different approach from the pseudotime estimation methods we have looked at so far. Earlier methods have all been “unsupervisedâ€, which is to say that apart from perhaps selecting informative genes we do not supply the method with any prior information about how we expect certain genes or the trajectory as a whole to behave. Ouija, in contrast, is a probabilistic framework that allows for interpretable learning of single-cell pseudotimes using only small panels of marker genes. 
This method: infers pseudotimes from a small number of marker genes, letting you understand why the pseudotimes have been learned in terms of those genes; provides parameter estimates (with uncertainty) for interpretable gene regulation behaviour (such as the peak time or the upregulation time); has a Bayesian hypothesis test to find genes regulated before others along the trajectory; identifies metastable states, i.e. discrete cell types along the continuous trajectory. We will supply the following marker genes to Ouija (with timepoints where they are expected to be highly expressed): Early timepoints: Dazl, Rnf17, Sycp3, Nanog, Pou5f1, Fgf8, Egfr, Bmp5, Bmp15 Mid timepoints: Zscan4b, Foxa1, Prdm14, Sox21 Late timepoints: Creb3, Gpx4, Krt8, Elf5, Eomes, Cdx2, Tdgf1, Gdf3 With Ouija we can model genes as either exhibiting monotonic up or down regulation (known as switch-like behaviour), or transient behaviour where the gene briefly peaks. By default, Ouija assumes all genes exhibit switch-like behaviour (the authors assure us not to worry if we get it wrong - the noise model means incorrectly specifying a transient gene as switch-like has minimal effect). Here we can "cheat" a little and check that our selected marker genes do actually identify different timepoints of the differentiation process. ouija_markers_down <- c("Dazl", "Rnf17", "Sycp3", "Fgf8", "Egfr", "Bmp5", "Bmp15", "Pou5f1") ouija_markers_up <- c("Creb3", "Gpx4", "Krt8", "Elf5", "Cdx2", "Tdgf1", "Gdf3", "Eomes") ouija_markers_transient <- c("Zscan4b", "Foxa1", "Prdm14", "Sox21") ouija_markers <- c(ouija_markers_down, ouija_markers_up, ouija_markers_transient) plotExpression(deng_SCE, ouija_markers, x = "cell_type2", colour_by = "cell_type2") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) In order to fit the pseudotimes we simply call ouija, passing in the expected response types. 
Note that if no response types are provided then they are all assumed to be switch-like by default, which we will do here. The input to Ouija can be a cell-by-gene matrix of non-negative expression values, or an ExpressionSet object, or, happily, by selecting the logcounts values from a SingleCellExperiment object. We can apply prior information about whether genes are up- or down-regulated across the differentiation process, and also provide prior information about when the switch in expression or a peak in expression is likely to occur. We can fit the Ouija model using either: Hamiltonian Monte Carlo (HMC) - full MCMC inference where gradient information of the log-posterior is used to “guide†the random walk through the parameter space, or Automatic Differentiation Variational Bayes (ADVI or simply VI) - approximate inference where the KL divergence to an approximate distribution is minimised. In general, HMC will provide more accurate inference with approximately correct posterior variance for all parameters. However, VB is orders of magnitude quicker than HMC and while it may underestimate posterior variance, the Ouija authors suggest that anecdotally it often performs as well as HMC for discovering posterior pseudotimes. To help the Ouija model, we provide it with prior information about the strength of switches for up- and down-regulated genes. By setting switch strength to -10 for down-regulated genes and 10 for up-regulated genes with a prior strength standard deviation of 0.5 we are telling the model that we are confident about the expected behaviour of these genes across the differentiation process. 
options(mc.cores = parallel::detectCores()) response_type <- c(rep("switch", length(ouija_markers_down) + length(ouija_markers_up)), rep("transient", length(ouija_markers_transient))) switch_strengths <- c(rep(-10, length(ouija_markers_down)), rep(10, length(ouija_markers_up))) switch_strength_sd <- c(rep(0.5, length(ouija_markers_down)), rep(0.5, length(ouija_markers_up))) garbage <- capture.output( oui_vb <- ouija(deng_SCE[ouija_markers,], single_cell_experiment_assay = "logcounts", response_type = response_type, switch_strengths = switch_strengths, switch_strength_sd = switch_strength_sd, inference_type = "vb") ) print(oui_vb) ## A Ouija fit with 268 cells and 20 marker genes ## Inference type: Variational Bayes ## (Gene behaviour) Switch/transient: 16 / 4 We can plot the gene expression over pseudotime along with the maximum a posteriori (MAP) estimates of the mean function (the sigmoid or Gaussian transient function) using the plot_expression function. plot_expression(oui_vb) We can also visualise when in the trajectory gene regulation behaviour occurs, either in the form of the switch time or the peak time (for switch-like or transient genes) using the plot_switch_times and plot_transient_times functions: plot_switch_times(oui_vb) plot_peak_times(oui_vb) Identify metastable states using consistency matrices. 
cmo <- consistency_matrix(oui_vb) plot_consistency(oui_vb) cell_classifications <- cluster_consistency(cmo) map_pst <- map_pseudotime(oui_vb) ouija_pseudotime <- data.frame(map_pst, cell_classifications) ggplot(ouija_pseudotime, aes(x = map_pst, y = cell_classifications)) + geom_point() + xlab("MAP pseudotime") + ylab("Cell classification") deng_SCE$pseudotime_ouija <- ouija_pseudotime$map_pst deng_SCE$ouija_cell_class <- ouija_pseudotime$cell_classifications ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_ouija, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab("Ouija pseudotime") + ylab("Timepoint") + theme_classic() Ouija does quite well in the ordering of the cells here, although it can be sensitive to the choice of marker genes and prior information supplied. How do the results change if you select different marker genes or change the priors? Ouija identifies four metastable states here, which we might annotate as “zygote/2cellâ€, “4/8/16 cellâ€, “blast1†and “blast2â€. ggplot(as.data.frame(colData(deng_SCE)), aes(x = as.factor(ouija_cell_class), y = pseudotime_ouija, colour = cell_type2)) + geom_boxplot() + coord_flip() + scale_color_manual(values = my_color) + theme_classic() + xlab("Ouija cell classification") + ylab("Ouija pseudotime") + theme_classic() A common analysis is to work out the regulation orderings of genes. For example, is gene A upregulated before gene B? Does gene C peak before the downregulation of gene D? Ouija answers these questions in terms of a Bayesian hypothesis test of whether the difference in regulation timing (either switch time or peak time) is significantly different to 0. This is collated using the gene_regulation function. 
gene_regs <- gene_regulation(oui_vb) head(gene_regs) ## # A tibble: 6 x 7 ## # Groups: label, gene_A [6] ## label gene_A gene_B mean_difference lower_95 upper_95 significant ## <chr> <chr> <chr> <dbl> <dbl> <dbl> <lgl> ## 1 Bmp15 - Cdx2 Bmp15 Cdx2 -0.0631 -0.109 -0.0133 TRUE ## 2 Bmp15 - Creb3 Bmp15 Creb3 0.269 0.201 0.321 TRUE ## 3 Bmp15 - Elf5 Bmp15 Elf5 -0.678 -0.718 -0.644 TRUE ## 4 Bmp15 - Eomes Bmp15 Eomes 0.0822 0.00272 0.156 TRUE ## 5 Bmp15 - Foxa1 Bmp15 Foxa1 -0.0211 -0.0508 0.0120 FALSE ## 6 Bmp15 - Gdf3 Bmp15 Gdf3 0.0644 0.0163 0.126 TRUE What conclusions can you draw from the gene regulation output from Ouija? If you have time, you might try the HMC inference method and see if that changes the Ouija results in any way. 11.1.8 Comparison of the methods How do the trajectories inferred by TSCAN, Monocle, Diffusion Map, SLICER and Ouija compare? TSCAN and Diffusion Map methods get the trajectory the “wrong way roundâ€, so we’ll adjust that for these comparisons. df_pseudotime <- as.data.frame( colData(deng_SCE)[, grep("pseudotime", colnames(colData(deng_SCE)))] ) colnames(df_pseudotime) <- gsub("pseudotime_", "", colnames(df_pseudotime)) df_pseudotime$PC1 <- reducedDim(deng_SCE,"PCA")[,1] df_pseudotime$order_tscan <- -df_pseudotime$order_tscan #df_pseudotime$diffusionmap <- df_pseudotime$diffusionmap df_pseudotime$slingshot1 <- colData(deng_SCE)$slingPseudotime_1 corrplot.mixed(cor(df_pseudotime, use = "na.or.complete"), order = "hclust", tl.col = "black", main = "Correlation matrix for pseudotime results", mar = c(0, 0, 3.1, 0)) We see here that Ouija, TSCAN and SLICER all give trajectories that are similar and strongly correlated with PC1. Diffusion Map is less strongly correlated with these methods, and Monocle gives very different results. 11.1.9 Expression of genes through time Each package also enables the visualization of expression through pseudotime. 
Following individual genes is very helpful for identifying genes that play an important role in the differentiation process. We illustrate the procedure using the Nanog gene. We have added the pseudotime values computed with all methods here to the colData slot of an SCE object. Having done that, the full plotting capabilities of the scater package can be used to investigate relationships between gene expression, cell populations and pseudotime. This is particularly useful for the packages such as SLICER that do not provide plotting functions. Principal components deng_SCE$PC1 <- reducedDim(deng_SCE,"PCA")[,1] plotExpression(deng_SCE, "Nanog", x = "PC1", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) TSCAN plotExpression(deng_SCE, "Nanog", x = "pseudotime_order_tscan", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) Monocle plotExpression(deng_SCE, "Nanog", x = "pseudotime_monocle2", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) Diffusion Map plotExpression(deng_SCE, "Nanog", x = "pseudotime_diffusionmap", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) SLICER plotExpression(deng_SCE, "Nanog", x = "pseudotime_slicer", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) Ouija plotExpression(deng_SCE, "Nanog", x = "pseudotime_ouija", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) How many of these methods outperform the naive approach of using the first principal component to represent pseudotime for these data? Exercise 7: Repeat the exercise using a subset of the genes, e.g. 
the set of highly variable genes that can be obtained using Brennecke_getVariableGenes() 11.1.10 dynverse https://dynverse.org/users/2-quick_start/ library(dyno) library(tidyverse) # Reproduces the guidelines as created in the shiny app answers <- dynguidelines::answer_questions( multiple_disconnected = FALSE, expect_topology = TRUE, expected_topology = "linear", n_cells = 3000, n_features = 10000, memory = "100GB", docker = FALSE ) guidelines <- dynguidelines::guidelines(answers = answers) guidelines deng_dataset <- wrap_expression( counts = counts(deng_SCE), expression = assay(deng_SCE,"logcounts") ) model <- infer_trajectory(deng_dataset, first(guidelines$methods_selected)) ## Loading required namespace: hdf5r model <- model %>% add_dimred(dyndimred::dimred_mds, expression_source = deng_dataset$expression) plot_dimred( model, expression_source = deng_dataset$expression, grouping = deng_SCE$cell_type2 ) 11.1.11 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] splines parallel stats4 stats graphics grDevices utils ## [8] datasets methods base ## ## other attached packages: ## [1] rstan_2.19.2 StanHeaders_2.19.0 ## [3] lle_1.1 snowfall_1.84-6.1 ## [5] snow_0.4-3 MASS_7.3-51.1 ## [7] scatterplot3d_0.3-41 monocle3_0.2.0 ## [9] ouija_0.99.0 Rcpp_1.0.2 ## [11] SLICER_0.2.0 slingshot_1.2.0 ## [13] princurve_2.1.4 Polychrome_1.2.3 ## [15] corrplot_0.84 ggbeeswarm_0.6.0 ## [17] ggthemes_4.2.0 scater_1.12.2 ## [19] destiny_2.14.0 
monocle_2.12.0 ## [21] DDRTree_0.1.5 irlba_2.3.3 ## [23] VGAM_1.1-1 ggplot2_3.2.1 ## [25] Matrix_1.2-17 M3Drop_1.10.0 ## [27] numDeriv_2016.8-1.1 TSCAN_1.22.0 ## [29] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [31] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [33] matrixStats_0.55.0 Biobase_2.44.0 ## [35] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 ## [37] IRanges_2.18.3 S4Vectors_0.22.1 ## [39] BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] rgl_0.100.30 rsvd_1.0.2 ## [3] vcd_1.4-4 Hmisc_4.2-0 ## [5] zinbwave_1.6.0 corpcor_1.6.9 ## [7] ps_1.3.0 class_7.3-15 ## [9] foreach_1.4.7 lmtest_0.9-37 ## [11] glmnet_2.0-18 crayon_1.3.4 ## [13] laeken_0.5.0 nlme_3.1-139 ## [15] backports_1.1.4 qlcMatrix_0.9.7 ## [17] rlang_0.4.0 XVector_0.24.0 ## [19] readxl_1.3.1 callr_3.3.2 ## [21] limma_3.40.6 phylobase_0.8.6 ## [23] smoother_1.1 manipulateWidget_0.10.0 ## [25] bit64_0.9-7 loo_2.1.0 ## [27] glue_1.3.1 pheatmap_1.0.12 ## [29] rngtools_1.4 splancs_2.01-40 ## [31] processx_3.4.1 vipor_0.4.5 ## [33] AnnotationDbi_1.46.1 haven_2.1.1 ## [35] tidyselect_0.2.5 rio_0.5.16 ## [37] XML_3.98-1.20 tidyr_1.0.0 ## [39] zoo_1.8-6 xtable_1.8-4 ## [41] magrittr_1.5 evaluate_0.14 ## [43] bibtex_0.4.2 cli_1.1.0 ## [45] zlibbioc_1.30.0 rstudioapi_0.10 ## [47] miniUI_0.1.1.1 sp_1.3-1 ## [49] rpart_4.1-15 locfdr_1.1-8 ## [51] RcppEigen_0.3.3.5.0 shiny_1.3.2 ## [53] BiocSingular_1.0.0 xfun_0.9 ## [55] leidenbase_0.1.0 inline_0.3.15 ## [57] pkgbuild_1.0.5 cluster_2.1.0 ## [59] caTools_1.17.1.2 sgeostat_1.0-27 ## [61] tibble_2.1.3 ggrepel_0.8.1 ## [63] ape_5.3 stabledist_0.7-1 ## [65] zeallot_0.1.0 withr_2.1.2 ## [67] bitops_1.0-6 slam_0.1-45 ## [69] ranger_0.11.2 plyr_1.8.4 ## [71] cellranger_1.1.0 pcaPP_1.9-73 ## [73] sparsesvd_0.2 coda_0.19-3 ## [75] e1071_1.7-2 RcppParallel_4.4.3 ## [77] pillar_1.4.2 gplots_3.0.1.1 ## [79] reldist_1.6-6 kernlab_0.9-27 ## [81] TTR_0.23-5 ellipsis_0.3.0 ## [83] tripack_1.3-8 DelayedMatrixStats_1.6.1 ## [85] xts_0.11-2 
vctrs_0.2.0 ## [87] NMF_0.21.0 tools_3.6.0 ## [89] foreign_0.8-70 rncl_0.8.3 ## [91] beeswarm_0.2.3 munsell_0.5.0 ## [93] proxy_0.4-23 HSMMSingleCell_1.4.0 ## [95] compiler_3.6.0 abind_1.4-5 ## [97] httpuv_1.5.2 pkgmaker_0.27 ## [99] GenomeInfoDbData_1.2.1 gridExtra_2.3 ## [101] edgeR_3.26.8 lattice_0.20-38 ## [103] deldir_0.1-23 utf8_1.1.4 ## [105] later_0.8.0 dplyr_0.8.3 ## [107] jsonlite_1.6 scales_1.0.0 ## [109] docopt_0.6.1 carData_3.0-2 ## [111] genefilter_1.66.0 lazyeval_0.2.2 ## [113] promises_1.0.1 spatstat_1.61-0 ## [115] car_3.0-3 doParallel_1.0.15 ## [117] latticeExtra_0.6-28 R.utils_2.9.0 ## [119] goftest_1.1-1 spatstat.utils_1.13-0 ## [121] checkmate_1.9.4 cowplot_1.0.0 ## [123] rmarkdown_1.15 openxlsx_4.1.0.1 ## [125] statmod_1.4.32 webshot_0.5.1 ## [127] Rtsne_0.15 forcats_0.4.0 ## [129] copula_0.999-19.1 softImpute_1.4 ## [131] uwot_0.1.4 igraph_1.2.4.1 ## [133] HDF5Array_1.12.2 survival_2.43-3 ## [135] yaml_2.2.0 htmltools_0.3.6 ## [137] memoise_1.1.0 locfit_1.5-9.1 ## [139] viridisLite_0.3.0 digest_0.6.21 ## [141] assertthat_0.2.1 mime_0.7 ## [143] densityClust_0.3 registry_0.5-1 ## [145] RSQLite_2.1.2 data.table_1.12.2 ## [147] blob_1.2.0 R.oo_1.22.0 ## [149] RNeXML_2.3.0 labeling_0.3 ## [151] fastICA_1.2-2 Formula_1.2-3 ## [153] Rhdf5lib_1.6.1 RCurl_1.95-4.12 ## [155] hms_0.5.1 rhdf5_2.28.0 ## [157] colorspace_1.4-1 base64enc_0.1-3 ## [159] nnet_7.3-12 ADGofTest_0.3 ## [161] mclust_5.4.5 bookdown_0.13 ## [163] RANN_2.6.1 mvtnorm_1.0-11 ## [165] fansi_0.4.0 pspline_1.0-18 ## [167] VIM_4.8.0 R6_2.4.0 ## [169] grid_3.6.0 lifecycle_0.1.0 ## [171] acepack_1.4.1 zip_2.0.4 ## [173] curl_4.2 gdata_2.18.0 ## [175] robustbase_0.93-5 howmany_0.3-1 ## [177] RcppAnnoy_0.0.13 RColorBrewer_1.1-2 ## [179] MCMCglmm_2.29 iterators_1.0.12 ## [181] alphahull_2.2 stringr_1.4.0 ## [183] htmlwidgets_1.3 polyclip_1.10-0 ## [185] purrr_0.3.2 crosstalk_1.0.0 ## [187] mgcv_1.8-28 tensorA_0.36.1 ## [189] htmlTable_1.13.2 clusterExperiment_2.4.4 ## [191] codetools_0.2-16 
FNN_1.1.3 ## [193] gtools_3.8.1 prettyunits_1.0.2 ## [195] gridBase_0.4-7 RSpectra_0.15-0 ## [197] R.methodsS3_1.7.1 gtable_0.3.0 ## [199] DBI_1.0.0 highr_0.8 ## [201] tensor_1.5 httr_1.4.1 ## [203] KernSmooth_2.23-15 stringi_1.4.3 ## [205] progress_1.2.2 reshape2_1.4.3 ## [207] uuid_0.1-2 cubature_2.0.3 ## [209] annotate_1.62.0 viridis_0.5.1 ## [211] xml2_1.2.2 combinat_0.0-8 ## [213] bbmle_1.0.20 boot_1.3-20 ## [215] BiocNeighbors_1.2.0 ade4_1.7-13 ## [217] DEoptimR_1.0-8 bit_1.1-14 ## [219] spatstat.data_1.4-0 pkgconfig_2.0.3 ## [221] gsl_2.1-6 knitr_1.25 References "], -["dechapter.html", "12 Differential Expression (DE) analysis 12.1 Introduction to DE analysis 12.2 DE in a real dataset", " 12 Differential Expression (DE) analysis 12.1 Introduction to DE analysis 12.1.1 Bulk RNA-seq One of the most common types of analyses when working with bulk RNA-seq data is to identify differentially expressed genes. By comparing the genes that change between two conditions, e.g. mutant and wild-type or stimulated and unstimulated, it is possible to characterize the molecular mechanisms underlying the change. Several different methods, e.g. DESeq2 and edgeR, have been developed for bulk RNA-seq. Moreover, there are also extensive datasets available where the RNA-seq data has been validated using RT-qPCR. These data can be used to benchmark DE finding algorithms and the available evidence suggests that the algorithms are performing quite well. 12.1.2 Single cell RNA-seq In contrast to bulk RNA-seq, in scRNA-seq we usually do not have a defined set of experimental conditions. Instead, as was shown in a previous chapter (10.2) we can identify the cell groups by using an unsupervised clustering approach. Once the groups have been identified one can find differentially expressed genes either by comparing the differences in variance between the groups (like the Kruskal-Wallis test implemented in SC3), or by comparing gene expression between clusters in a pairwise manner. 
In the following chapter we will mainly consider tools developed for pairwise comparisons. 12.1.3 Differences in Distribution Unlike bulk RNA-seq, we generally have a large number of samples (i.e. cells) for each group we are comparing in single-cell experiments. Thus we can take advantage of the whole distribution of expression values in each group to identify differences between groups rather than only comparing estimates of mean-expression as is standard for bulk RNASeq. There are two main approaches to comparing distributions. Firstly, we can use existing statistical models/distributions and fit the same type of model to the expression in each group then test for differences in the parameters for each model, or test whether the model fits better if a particular parameter is allowed to be different according to group. For instance in Chapter 7.6 we used edgeR to test whether allowing mean expression to be different in different batches significantly improved the fit of a negative binomial model of the data. Alternatively, we can use a non-parametric test which does not assume that expression values follow any particular distribution, e.g. the Kolmogorov-Smirnov test (KS-test). Non-parametric tests generally convert observed expression values to ranks and test whether the distribution of ranks for one group is significantly different from the distribution of ranks for the other group. However, some non-parametric methods fail in the presence of a large number of tied values, such as the case for dropouts (zeros) in single-cell RNA-seq expression data. Moreover, if the conditions for a parametric test hold, then it will typically be more powerful than a non-parametric test. 
12.1.4 Models of single-cell RNASeq data The most common model of RNASeq data is the negative binomial model: set.seed(1) hist( rnbinom( 1000, mu = 10, size = 100), col = "grey50", xlab = "Read Counts", main = "Negative Binomial" ) Mean: \\(\\mu = mu\\) Variance: \\(\\sigma^2 = mu + mu^2/size\\) It is parameterized by the mean expression (mu) and the dispersion (size), which is inversely related to the variance. The negative binomial model fits bulk RNA-seq data very well and it is used for most statistical methods designed for such data. In addition, it has been show to fit the distribution of molecule counts obtained from data tagged by unique molecular identifiers (UMIs) quite well (Grun et al. 2014, Islam et al. 2011). However, a raw negative binomial model does not fit full-length transcript data as well due to the high dropout rates relative to the non-zero read counts. For this type of data a variety of zero-inflated negative binomial models have been proposed (e.g. MAST, SCDE). d <- 0.5; counts <- rnbinom( 1000, mu = 10, size = 100 ) counts[runif(1000) < d] <- 0 hist( counts, col = "grey50", xlab = "Read Counts", main = "Zero-inflated NB" ) Mean: \\(\\mu = mu \\cdot (1 - d)\\) Variance: \\(\\sigma^2 = \\mu \\cdot (1-d) \\cdot (1 + d \\cdot \\mu + \\mu / size)\\) These models introduce a new parameter \\(d\\), for the dropout rate, to the negative binomial model. As we saw in Chapter 19, the dropout rate of a gene is strongly correlated with the mean expression of the gene. Different zero-inflated negative binomial models use different relationships between mu and d and some may fit \\(\\mu\\) and \\(d\\) to the expression of each gene independently. Finally, several methods use a Poisson-Beta distribution which is based on a mechanistic model of transcriptional bursting. 
There is strong experimental support for this model (Kim and Marioni, 2013) and it provides a good fit to scRNA-seq data but it is less easy to use than the negative-binomial models and much less existing methods upon which to build than the negative binomial model. a <- 0.1 b <- 0.1 g <- 100 lambdas <- rbeta(1000, a, b) counts <- sapply(g*lambdas, function(l) {rpois(1, lambda = l)}) hist( counts, col = "grey50", xlab = "Read Counts", main = "Poisson-Beta" ) Mean: \\(\\mu = g \\cdot a / (a + b)\\) Variance: \\(\\sigma^2 = g^2 \\cdot a \\cdot b/((a + b + 1) \\cdot (a + b)^2)\\) This model uses three parameters: \\(a\\) the rate of activation of transcription; \\(b\\) the rate of inhibition of transcription; and \\(g\\) the rate of transcript production while transcription is active at the locus. Differential expression methods may test each of the parameters for differences across groups or only one (often \\(g\\)). All of these models may be further expanded to explicitly account for other sources of gene expression differences such as batch-effect or library depth depending on the particular DE algorithm. Exercise: Vary the parameters of each distribution to explore how they affect the distribution of gene expression. How similar are the Poisson-Beta and Negative Binomial models? 12.2 DE in a real dataset library(scRNA.seq.funcs) library(edgeR) library(monocle) library(MAST) library(ROCR) set.seed(1) 12.2.1 Introduction To test different single-cell differential expression methods we will be using the Blischak dataset from Chapters 7-17. For this experiment bulk RNA-seq data for each cell-line was generated in addition to single-cell data. We will use the differentially expressed genes identified using standard methods on the respective bulk data as the ground truth for evaluating the accuracy of each single-cell method. To save time we have pre-computed these for you. You can run the commands below to load these data. 
DE <- read.table("data/tung/TPs.txt") notDE <- read.table("data/tung/TNs.txt") GroundTruth <- list( DE = as.character(unlist(DE)), notDE = as.character(unlist(notDE)) ) This ground truth has been produced for the comparison of individual NA19101 to NA19239. Now load the respective single-cell data: molecules <- read.table("data/tung/molecules.txt", sep = "\\t") anno <- read.table("data/tung/annotation.txt", sep = "\\t", header = TRUE) keep <- anno[,1] == "NA19101" | anno[,1] == "NA19239" data <- molecules[,keep] group <- anno[keep,1] batch <- anno[keep,4] # remove genes that aren't expressed in at least 6 cells gkeep <- rowSums(data > 0) > 5; counts <- data[gkeep,] # Library size normalization lib_size = colSums(counts) norm <- t(t(counts)/lib_size * median(lib_size)) # Variant of CPM for datasets with library sizes of fewer than 1 mil molecules Now we will compare various single-cell DE methods. We will focus on methods that performed well in Soneson and Robinson's [2019; CITE] detailed comparison of differential expression methods for single-cell data. Note that we will only be running methods which are available as R-packages and run relatively quickly. 12.2.2 Kolmogorov-Smirnov test The types of test that are easiest to work with are non-parametric ones. The most commonly used non-parametric test is the Kolmogorov-Smirnov test (KS-test) and we can use it to compare the distributions for each gene in the two individuals. The KS-test quantifies the distance between the empirical cumulative distributions of the expression of each gene in each of the two populations. It is sensitive to changes in mean expression and changes in variability. However it assumes data is continuous and may perform poorly when data contains a large number of identical values (e.g. zeros). Another issue with the KS-test is that it can be very sensitive for large sample sizes and thus it may end up as significant even though the magnitude of the difference is very small. 
Now run the test: pVals <- apply( norm, 1, function(x) { ks.test( x[group == "NA19101"], x[group == "NA19239"] )$p.value } ) # multiple testing correction pVals <- p.adjust(pVals, method = "fdr") This code “applies†the function to each row (specified by 1) of the expression matrix, data. In the function we are returning just the p.value from the ks.test output. We can now consider how many of the ground truth positive and negative DE genes are detected by the KS-test: 12.2.2.1 Evaluating Accuracy sigDE <- names(pVals)[pVals < 0.05] length(sigDE) # Number of KS-DE genes sum(GroundTruth$DE %in% sigDE) # Number of KS-DE genes that are true DE genes sum(GroundTruth$notDE %in% sigDE) # Number of KS-DE genes that are truly not-DE As you can see many more of our ground truth negative genes were identified as DE by the KS-test (false positives) than ground truth positive genes (true positives), however this may be due to the larger number of notDE genes thus we typically normalize these counts as the True positive rate (TPR), TP/(TP + FN), and False positive rate (FPR), FP/(FP+TP). tp <- sum(GroundTruth$DE %in% sigDE) fp <- sum(GroundTruth$notDE %in% sigDE) tn <- sum(GroundTruth$notDE %in% names(pVals)[pVals >= 0.05]) fn <- sum(GroundTruth$DE %in% names(pVals)[pVals >= 0.05]) tpr <- tp/(tp + fn) fpr <- fp/(fp + tn) cat(c(tpr, fpr)) Now we can see the TPR is much higher than the FPR indicating the KS test is identifying DE genes. So far we’ve only evaluated the performance at a single significance threshold. Often it is informative to vary the threshold and evaluate performance across a range of values. This is then plotted as a receiver-operating-characteristic curve (ROC) and a general accuracy statistic can be calculated as the area under this curve (AUC). We will use the ROCR package to facilitate this plotting. 
# Only consider genes for which we know the ground truth pVals <- pVals[names(pVals) %in% GroundTruth$DE | names(pVals) %in% GroundTruth$notDE] truth <- rep(1, times = length(pVals)); truth[names(pVals) %in% GroundTruth$DE] = 0; pred <- ROCR::prediction(pVals, truth) perf <- ROCR::performance(pred, "tpr", "fpr") ROCR::plot(perf) aucObj <- ROCR::performance(pred, "auc") aucObj@y.values[[1]] # AUC Finally to facilitate the comparisons of other DE methods let's put this code into a function so we don't need to repeat it: DE_Quality_AUC <- function(pVals) { pVals <- pVals[names(pVals) %in% GroundTruth$DE | names(pVals) %in% GroundTruth$notDE] truth <- rep(1, times = length(pVals)); truth[names(pVals) %in% GroundTruth$DE] = 0; pred <- ROCR::prediction(pVals, truth) perf <- ROCR::performance(pred, "tpr", "fpr") ROCR::plot(perf) aucObj <- ROCR::performance(pred, "auc") return(aucObj@y.values[[1]]) } 12.2.3 Wilcox/Mann-Whitney-U Test The Wilcox-rank-sum test is another non-parametric test, but tests specifically if values in one group are greater/less than the values in the other group. Thus it is often considered a test for difference in median expression between two groups; whereas the KS-test is sensitive to any change in distribution of expression values. pVals <- apply( norm, 1, function(x) { wilcox.test( x[group == "NA19101"], x[group == "NA19239"] )$p.value } ) # multiple testing correction pVals <- p.adjust(pVals, method = "fdr") DE_Quality_AUC(pVals) 12.2.4 edgeR We've already used edgeR for differential expression in Chapter 7.6. edgeR is based on a negative binomial model of gene expression and uses a generalized linear model (GLM) framework, which enables us to include other factors such as batch in the model. 
dge <- DGEList( counts = counts, norm.factors = rep(1, length(counts[1,])), group = group ) group_edgeR <- factor(group) design <- model.matrix(~ group_edgeR) dge <- estimateDisp(dge, design = design, trend.method = "none") fit <- glmFit(dge, design) res <- glmLRT(fit) pVals <- res$table[,4] names(pVals) <- rownames(res$table) pVals <- p.adjust(pVals, method = "fdr") DE_Quality_AUC(pVals) 12.2.5 MAST MAST is based on a zero-inflated negative binomial model. It tests for differential expression using a hurdle model to combine tests of discrete (0 vs not zero) and continuous (non-zero values) aspects of gene expression. Again this uses a linear modelling framework to enable complex models to be considered. log_counts <- log(counts + 1) / log(2) fData <- data.frame(names = rownames(log_counts)) rownames(fData) <- rownames(log_counts); cData <- data.frame(cond = group) rownames(cData) <- colnames(log_counts) obj <- FromMatrix(as.matrix(log_counts), cData, fData) colData(obj)$cngeneson <- scale(colSums(assay(obj) > 0)) cond <- factor(colData(obj)$cond) # Model expression as function of condition & number of detected genes zlmCond <- zlm.SingleCellAssay(~ cond + cngeneson, obj) summaryCond <- summary(zlmCond, doLRT = "condNA19101") summaryDt <- summaryCond$datatable summaryDt <- as.data.frame(summaryDt) pVals <- unlist(summaryDt[summaryDt$component == "H",4]) # H = hurdle model names(pVals) <- unlist(summaryDt[summaryDt$component == "H",1]) pVals <- p.adjust(pVals, method = "fdr") DE_Quality_AUC(pVals) 12.2.6 limma 12.2.7 Pseudobulk 12.2.8 sessionInfo() "], +["introduction-to-rbioconductor.html", "3 Introduction to R/Bioconductor 3.1 Installing packages 3.2 Installation instructions: 3.3 Data-types/classes 3.4 Basic data structures 3.5 Accessing documentation and help files 3.6 Data Types 3.7 What is Bioconductor? 
3.8 SingleCellExperiment class 3.9 scater package 3.10 Introduction to ggplot2", " 3 Introduction to R/Bioconductor 3.1 Installing packages 3.1.1 CRAN The Comprehensive R Archive Network CRAN is the biggest archive of R packages. There are few requirements for uploading packages besides building and installing succesfully, hence documentation and support is often minimal and figuring how to use these packages can be a challenge it itself. CRAN is the default repository R will search to find packages to install: install.packages("devtools") require("devtools") 3.1.2 Github Github isn’t specific to R, any code of any type in any state can be uploaded. There is no guarantee a package uploaded to github will even install, nevermind do what it claims to do. R packages can be downloaded and installed directly from github using the “devtools†package installed above. devtools::install_github("tallulandrews/M3Drop") Github is also a version control system which stores multiple versions of any package. By default the most recent “master†version of the package is installed. If you want an older version or the development branch this can be specified using the “ref†parameter: # different branch devtools::install_github("tallulandrews/M3D", ref="nbumi") # previous commit devtools::install_github("tallulandrews/M3Drop", ref="434d2da28254acc8de4940c1dc3907ac72973135") Note: make sure you re-install the M3Drop master branch for later in the course. 3.1.3 Bioconductor Bioconductor is a repository of R-packages specifically for biological analyses. It has the strictest requirements for submission, including installation on every platform and full documentation with a tutorial (called a vignette) explaining how the package should be used. Bioconductor also encourages utilization of standard data structures/classes and coding style/naming conventions, so that, in theory, packages and analyses can be combined into large pipelines or workflows. 
source("https://bioconductor.org/biocLite.R") biocLite("edgeR") Note: in some situations it is necessary to substitute “http://†for “https://†in the above depending on the security features of your internet connection/network. Bioconductor also requires creators to support their packages and has a regular 6-month release schedule. Make sure you are using the most recent release of bioconductor before trying to install packages for the course. source("https://bioconductor.org/biocLite.R") biocLite("BiocUpgrade") 3.1.4 Source The final way to install packages is directly from source. In this case you have to download a fully built source code file, usually packagename.tar.gz, or clone the github repository and rebuild the package yourself. Generally this will only be done if you want to edit a package yourself, or if for some reason the former methods have failed. install.packages("M3Drop_3.05.00.tar.gz", type="source") 3.2 Installation instructions: All the packages necessary for this course are available here. Starting from “RUN Rscript -eâ€install.packages(‘devtools’)" “, run each of the commands (minusâ€RUN") on the command line or start an R session and run each of the commands within the quotation marks. Note the ordering of the installation is important in some cases, so make sure you run them in order from top to bottom. 3.3 Data-types/classes R is a high level language so the underlying data-type is generally not important. The exception if you are accessing R data directly using another language such as C, but that is beyond the scope of this course. Instead we will consider the basic data classes: numeric, integer, logical, and character, and the higher level data class called “factorâ€. You can check what class your data is using the “class()†function. Aside: R can also store data as “complex†for complex numbers but generally this isn’t relevant for biological analyses. 
3.3.1 Numeric The "numeric" class is the default class for storing any numeric data - integers, decimal numbers, numbers in scientific notation, etc. x = 1.141 class(x) ## [1] "numeric" y = 42 class(y) ## [1] "numeric" z = 6.02e23 class(z) ## [1] "numeric" Here we see that even though R has an "integer" class and 42 could be stored more efficiently as an integer the default is to store it as "numeric". If we want 42 to be stored as an integer we must "coerce" it to that class: y = as.integer(42) class(y) ## [1] "integer" Coercion will force R to store data as a particular class; if our data is incompatible with that class it will still do it but the data will be converted to NAs: as.numeric("H") ## Warning: NAs introduced by coercion ## [1] NA Above we tried to coerce "character" data, identified by the double quotation marks, into numeric data which doesn't make sense, so we triggered ("threw") a warning message. Since this is only a warning R would continue with any subsequent commands in a script/function, whereas an "error" would cause R to halt. 3.3.2 Character/String The "character" class stores all kinds of text data. Programming convention calls data containing multiple letters a "string", thus most R functions which act on character data will refer to the data as "strings" and will often have "str" or "string" in its name. Strings are identified by being flanked by double quotation marks, whereas variable/function names are not: x = 5 a = "x" # character "x" a ## [1] "x" b = x # variable x b ## [1] 5 In addition to standard alphanumeric characters, strings can also store various special characters. Special characters are identified using a backslash followed by a single character, the most relevant are the special character for tab : \\t and new line : \\n. 
To demonstrate these special characters let’s concatenate (cat) together two strings with these characters separating (sep) them: cat("Hello", "World", sep= " ") ## Hello World cat("Hello", "World", sep= "\\t") ## Hello World cat("Hello", "World", sep= "\\n") ## Hello ## World Note that special characters work differently in different functions. For instance the paste function does the same thing as cat but does not recognize special characters. paste("Hello", "World", sep= " ") ## [1] "Hello World" paste("Hello", "World", sep= "\\t") ## [1] "Hello\\tWorld" paste("Hello", "World", sep= "\\n") ## [1] "Hello\\nWorld" Single or double backslash is also used as an escape character to turn off special characters or allow quotation marks to be included in strings: cat("This \\"string\\" contains quotation marks.") ## This "string" contains quotation marks. Special characters are generally only used in pattern matching, and reading/writing data to files. For instance this is how you would read a tab-separated file into R. dat = read.delim("file.tsv", sep="\\t") Another special type of character data are colours. Colours can be specified in three main ways: by name from those available, by red, green, blue values using the rgb function, and by hue (colour), saturation (colour vs white) and value (colour/white vs black) using the hsv function. By default rgb and hsv expect three values in 0-1 with an optional fourth value for transparency. Alternatively, sets of predetermined colours with useful properties can be loaded from many different packages with RColorBrewer being one of the most popular. reds = c("red", rgb(1,0,0), hsv(0, 1, 1)) reds ## [1] "red" "#FF0000" "#FF0000" barplot(c(1,1,1), col=reds, names=c("by_name", "by_rgb", "by_hsv")) 3.3.3 Logical The logical class stores boolean truth values, i.e. TRUE and FALSE. It is used for storing the results of logical operations and conditional statements will be coerced to this class. 
Most other data-types can be coerced to boolean without triggering (or “throwing”) error messages, which may cause unexpected behaviour. x = TRUE class(x) ## [1] "logical" y = "T" as.logical(y) ## [1] TRUE z = 5 as.logical(z) ## [1] TRUE x = FALSE class(x) ## [1] "logical" y = "F" as.logical(y) ## [1] FALSE z = 0 as.logical(z) ## [1] FALSE Exercise 1 Experiment with other character and numeric values: which are coerced to TRUE or FALSE? Which are coerced to neither? Do you ever throw a warning/error message? 3.3.4 Factors String/Character data is very memory inefficient to store, each letter generally requires the same amount of memory as any integer. Thus when storing a vector of strings with repeated elements it is more efficient to assign each element to an integer and store the vector as integers and an additional string-to-integer association table. Thus, by default R will read in text columns of a data table as factors. str_vector = c("Apple", "Apple", "Banana", "Banana", "Banana", "Carrot", "Carrot", "Apple", "Banana") factored_vector = factor(str_vector) factored_vector ## [1] Apple Apple Banana Banana Banana Carrot Carrot Apple Banana ## Levels: Apple Banana Carrot as.numeric(factored_vector) ## [1] 1 1 2 2 2 3 3 1 2 The double nature of factors can cause some unintuitive behaviour. E.g. joining two factors together will convert them to the numeric form and the original strings will be lost. c(factored_vector, factored_vector) ## [1] 1 1 2 2 2 3 3 1 2 1 1 2 2 2 3 3 1 2 Likewise if due to formatting issues numeric data is mistakenly interpreted as strings, then you must convert the factor back to strings before coercing to numeric values: x = c("20", "25", "23", "38", "20", "40", "25", "30") x = factor(x) as.numeric(x) ## [1] 1 3 2 5 1 6 3 4 as.numeric(as.character(x)) ## [1] 20 25 23 38 20 40 25 30 To make R read text as character data instead of factors set the environment option stringsAsFactors=FALSE. This must be done at the start of each R session. 
options(stringsAsFactors=FALSE) Exercise How would you use factors to create a vector of colours for an arbitrarily long vector of fruits like str_vector above? Answer 3.3.5 Checking class/type We recommend checking your data is of the correct class after reading from files: x = 1.4 is.numeric(x) ## [1] TRUE is.character(x) ## [1] FALSE is.logical(x) ## [1] FALSE is.factor(x) ## [1] FALSE 3.4 Basic data structures So far we have only looked at single values and vectors. Vectors are the simplest data structure in R. They are a 1-dimensional array of data all of the same type. If the input when creating a vector is of different types it will be coerced to the data-type that is most consistent with the data. x = c("Hello", 5, TRUE) x ## [1] "Hello" "5" "TRUE" class(x) ## [1] "character" Here we tried to put character, numeric and logical data into a single vector so all the values were coerced to character data. A matrix is the two dimensional version of a vector, it also requires all data to be of the same type. If we combine a character vector and a numeric vector into a matrix, all the data will be coerced to characters: x = c("A", "B", "C") y = c(1, 2, 3) class(x) ## [1] "character" class(y) ## [1] "numeric" m = cbind(x, y) m ## x y ## [1,] "A" "1" ## [2,] "B" "2" ## [3,] "C" "3" The quotation marks indicate that the numeric vector has been coerced to characters. Alternatively, to store data with columns of different data-types we can use a dataframe. z = data.frame(x, y) z ## x y ## 1 A 1 ## 2 B 2 ## 3 C 3 class(z[,1]) ## [1] "character" class(z[,2]) ## [1] "numeric" If you have set stringsAsFactors=FALSE as above you will find the first column remains characters, otherwise it will be automatically converted to a factor. 
options(stringsAsFactors=TRUE) z = data.frame(x, y) class(z[,1]) ## [1] "factor" Another difference between matrices and dataframes is the ability to select columns using the $ operator: m$x # throws an error z$x # ok The final basic data structure is the list. Lists allow data of different types and different lengths to be stored in a single object. Each element of a list can be any other R object : data of any type, any data structure, even other lists or functions. l = list(m, z) ll = list(sublist=l, a_matrix=m, numeric_value=42, this_string="Hello World", even_a_function=cbind) ll ## $sublist ## $sublist[[1]] ## x y ## [1,] "A" "1" ## [2,] "B" "2" ## [3,] "C" "3" ## ## $sublist[[2]] ## x y ## 1 A 1 ## 2 B 2 ## 3 C 3 ## ## ## $a_matrix ## x y ## [1,] "A" "1" ## [2,] "B" "2" ## [3,] "C" "3" ## ## $numeric_value ## [1] 42 ## ## $this_string ## [1] "Hello World" ## ## $even_a_function ## function (..., deparse.level = 1) ## .Internal(cbind(deparse.level, ...)) ## <bytecode: 0x5600bf7f70f8> ## <environment: namespace:base> Lists are most commonly used when returning a large number of results from a function that do not fit into any of the previous data structures. 3.5 Accessing documentation and help files You can get more information about any R commands relevant to these datatypes using by typing ?function in an interactive session. 3.6 Data Types 3.6.1 What is Tidy Data? Tidy data is a concept largely defined by Hadley Wickham (Wickham 2014). Tidy data has the following three characteristics: Each variable has its own column. Each observation has its own row. Each value has its own cell. 
Here is an example of some tidy data: ## Students Subject Years Score ## 1 Mark Maths 1 5 ## 2 Jane Biology 2 6 ## 3 Mohammed Physics 3 4 ## 4 Tom Maths 2 7 ## 5 Celia Computing 3 9 Here is an example of some untidy data: ## Students Sport Category Counts ## 1 Matt Tennis Wins 0 ## 2 Matt Tennis Losses 1 ## 3 Ellie Rugby Wins 3 ## 4 Ellie Rugby Losses 2 ## 5 Tim Football Wins 1 ## 6 Tim Football Losses 4 ## 7 Louise Swimming Wins 2 ## 8 Louise Swimming Losses 2 ## 9 Kelly Running Wins 5 ## 10 Kelly Running Losses 1 Task 1: In what ways is the untidy data not tidy? How could we make the untidy data tidy? Tidy data is generally easier to work with than untidy data, especially if you are working with packages such as ggplot. Fortunately, packages are available to make untidy data tidy. Today we will explore a few of the functions available in the tidyr package which can be used to make untidy data tidy. If you are interested in finding out more about tidying data, we recommend reading “R for Data Scienceâ€, by Garrett Grolemund and Hadley Wickham. An electronic copy is available here: http://r4ds.had.co.nz/ The untidy data above is untidy because two variables (Wins and Losses) are stored in one column (Category). This is a common way in which data can be untidy. To tidy this data, we need to make Wins and Losses into columns, and store the values in Counts in these columns. Fortunately, there is a function from the tidyverse packages to perform this operation. The function is called spread, and it takes two arguments, key and value. You should pass the name of the column which contains multiple variables to key, and pass the name of the column which contains values from multiple variables to value. 
For example: library(tidyverse) sports<-data.frame(Students=c("Matt", "Matt", "Ellie", "Ellie", "Tim", "Tim", "Louise", "Louise", "Kelly", "Kelly"), Sport=c("Tennis","Tennis", "Rugby", "Rugby","Football", "Football","Swimming","Swimming", "Running", "Running"), Category=c("Wins", "Losses", "Wins", "Losses", "Wins", "Losses", "Wins", "Losses", "Wins", "Losses"), Counts=c(0,1,3,2,1,4,2,2,5,1)) sports ## Students Sport Category Counts ## 1 Matt Tennis Wins 0 ## 2 Matt Tennis Losses 1 ## 3 Ellie Rugby Wins 3 ## 4 Ellie Rugby Losses 2 ## 5 Tim Football Wins 1 ## 6 Tim Football Losses 4 ## 7 Louise Swimming Wins 2 ## 8 Louise Swimming Losses 2 ## 9 Kelly Running Wins 5 ## 10 Kelly Running Losses 1 spread(sports, key=Category, value=Counts) ## Students Sport Losses Wins ## 1 Ellie Rugby 2 3 ## 2 Kelly Running 1 5 ## 3 Louise Swimming 2 2 ## 4 Matt Tennis 1 0 ## 5 Tim Football 4 1 Task 2: The dataframe foods defined below is untidy. Work out why and use spread() to tidy it foods<-data.frame(student=c("Antoinette","Antoinette","Taylor", "Taylor", "Alexa", "Alexa"), Category=c("Dinner", "Dessert", "Dinner", "Dessert", "Dinner","Dessert"), Frequency=c(3,1,4,5,2,1)) The other common way in which data can be untidy is if the columns are values instead of variables. For example, the dataframe below shows the percentages some students got in tests they did in May and June. The data is untidy because the columns May and June are values, not variables. percentages<-data.frame(student=c("Alejandro", "Pietro", "Jane"), "May"=c(90,12,45), "June"=c(80,30,100)) Fortunately, there is a function in the tidyverse packages to deal with this problem too. gather() takes the names of the columns which are values, the key and the value as arguments. This time, the key is the name of the variable with values as column names, and the value is the name of the variable with values spread over multiple columns. 
I.e.: gather(percentages, "May", "June", key="Month", value = "Percentage") ## student Month Percentage ## 1 Alejandro May 90 ## 2 Pietro May 12 ## 3 Jane May 45 ## 4 Alejandro June 80 ## 5 Pietro June 30 ## 6 Jane June 100 These examples don’t have much to do with single-cell RNA-seq analysis, but are designed to help illustrate the features of tidy and untidy data. You will find it much easier to analyse your single-cell RNA-seq data if your data is stored in a tidy format. Fortunately, the data structures we commonly use to facilitate single-cell RNA-seq analysis usually encourage you to store your data in a tidy manner. 3.6.2 What is Rich Data? If you google ‘rich data’, you will find lots of different definitions for this term. In this course, we will use ‘rich data’ to mean data which is generated by combining information from multiple sources. For example, you could make rich data by creating an object in R which contains a matrix of gene expression values across the cells in your single-cell RNA-seq experiment, but also information about how the experiment was performed. Objects of the SingleCellExperiment class, which we will discuss below, are an example of rich data. Typically, Bioconductor packages make use of rich data objects that have many advantages for package developers and users alike. 3.7 What is Bioconductor? From Wikipedia: Bioconductor is a free, open source and open development software project for the analysis and comprehension of genomic data generated by wet lab experiments in molecular biology. Bioconductor is based primarily on the statistical R programming language, but does contain contributions in other programming languages. It has two releases each year that follow the semiannual releases of R. At any one time there is a release version, which corresponds to the released version of R, and a development version, which corresponds to the development version of R. Most users will find the release version appropriate for their needs. 
We strongly recommend all new comers and even experienced high-throughput data analysts to use well developed and maintained Bioconductor methods and classes. 3.8 SingleCellExperiment class SingleCellExperiment (SCE) is a S4 class for storing data from single-cell experiments. This includes specialized methods to store and retrieve spike-in information, dimensionality reduction coordinates and size factors for each cell, along with the usual metadata for genes and libraries. In practice, an object of this class can be created using its constructor: library(SingleCellExperiment) counts <- matrix(rpois(100, lambda = 10), ncol=10, nrow=10) rownames(counts) <- paste("gene", 1:10, sep = "") colnames(counts) <- paste("cell", 1:10, sep = "") sce <- SingleCellExperiment( assays = list(counts = counts), rowData = data.frame(gene_names = paste("gene_name", 1:10, sep = "")), colData = data.frame(cell_names = paste("cell_name", 1:10, sep = "")) ) sce ## class: SingleCellExperiment ## dim: 10 10 ## metadata(0): ## assays(1): counts ## rownames(10): gene1 gene2 ... gene9 gene10 ## rowData names(1): gene_names ## colnames(10): cell1 cell2 ... cell9 cell10 ## colData names(1): cell_names ## reducedDimNames(0): ## spikeNames(0): In the SingleCellExperiment, users can assign arbitrary names to entries of assays. To assist interoperability between packages, some suggestions for what the names should be for particular types of data are provided by the authors: counts: Raw count data, e.g., number of reads or transcripts for a particular gene. normcounts: Normalized values on the same scale as the original counts. For example, counts divided by cell-specific size factors that are centred at unity. logcounts: Log-transformed counts or count-like values. In most cases, this will be defined as log-transformed normcounts, e.g., using log base 2 and a pseudo-count of 1. cpm: Counts-per-million. 
This is the read count for each gene in each cell, divided by the library size of each cell in millions. tpm: Transcripts-per-million. This is the number of transcripts for each gene in each cell, divided by the total number of transcripts in that cell (in millions). Each of these suggested names has an appropriate getter/setter method for convenient manipulation of the SingleCellExperiment. For example, we can take the (very specifically named) counts slot, normalise it and assign it to normcounts instead: normcounts(sce) <- log2(counts(sce) + 1) sce ## class: SingleCellExperiment ## dim: 10 10 ## metadata(0): ## assays(2): counts normcounts ## rownames(10): gene1 gene2 ... gene9 gene10 ## rowData names(1): gene_names ## colnames(10): cell1 cell2 ... cell9 cell10 ## colData names(1): cell_names ## reducedDimNames(0): ## spikeNames(0): dim(normcounts(sce)) ## [1] 10 10 head(normcounts(sce)) ## cell1 cell2 cell3 cell4 cell5 cell6 cell7 ## gene1 3.169925 3.169925 2.000000 2.584963 2.584963 3.321928 3.584963 ## gene2 3.459432 1.584963 3.584963 3.807355 3.700440 3.700440 3.000000 ## gene3 3.000000 3.169925 3.807355 3.169925 3.321928 3.321928 3.321928 ## gene4 3.584963 3.459432 3.000000 3.807355 3.700440 3.700440 3.700440 ## gene5 3.906891 3.000000 3.169925 3.321928 3.584963 3.459432 3.807355 ## gene6 3.700440 3.700440 3.584963 4.000000 3.169925 3.000000 3.459432 ## cell8 cell9 cell10 ## gene1 3.321928 3.807355 2.807355 ## gene2 3.807355 3.700440 4.000000 ## gene3 2.584963 4.000000 3.700440 ## gene4 3.169925 3.584963 3.700440 ## gene5 3.807355 2.584963 3.584963 ## gene6 3.321928 3.459432 4.000000 3.9 scater package scater is a R package for single-cell RNA-seq analysis (McCarthy et al. 2017). The package contains several useful methods for quality control, visualisation and pre-processing of data prior to further downstream analysis. 
scater features the following functionality: Automated computation of QC metrics Transcript quantification from read data with pseudo-alignment Data format standardisation Rich visualizations for exploratory analysis Seamless integration into the Bioconductor universe Simple normalisation methods We highly recommend to use scater for all single-cell RNA-seq analyses and scater is the basis of the first part of the course. As illustrated in the figure below, scater will help you with quality control, filtering and normalization of your expression matrix following mapping and alignment. Keep in mind that this figure represents the original version of scater where an SCESet class was used. In the newest version this figure is still correct, except that SCESet can be substituted with the SingleCellExperiment class. 3.10 Introduction to ggplot2 3.10.1 What is ggplot2? ggplot2 is an R package designed by Hadley Wickham which facilitates data plotting. In this lab, we will touch briefly on some of the features of the package. If you would like to learn more about how to use ggplot2, we would recommend reading “ggplot2 Elegant graphics for data analysisâ€, by Hadley Wickham. 3.10.2 Principles of ggplot2 Your data must be a dataframe if you want to plot it using ggplot2. Use the aes mapping function to specify how variables in the dataframe map to features on your plot Use geoms to specify how your data should be represented on your graph eg. as a scatterplot, a barplot, a boxplot etc. 3.10.3 Using the aes mapping function The aes function specifies how variables in your dataframe map to features on your plot. 
To understand how this works, let’s look at an example: library(ggplot2) library(tidyverse) set.seed(1) counts <- as.data.frame(matrix(rpois(100, lambda = 10), ncol=10, nrow=10)) Gene_ids <- paste("gene", 1:10, sep = "") colnames(counts) <- paste("cell", 1:10, sep = "") counts<-data.frame(Gene_ids, counts) counts ## Gene_ids cell1 cell2 cell3 cell4 cell5 cell6 cell7 cell8 cell9 cell10 ## 1 gene1 8 8 3 5 5 9 11 9 13 6 ## 2 gene2 10 2 11 13 12 12 7 13 12 15 ## 3 gene3 7 8 13 8 9 9 9 5 15 12 ## 4 gene4 11 10 7 13 12 12 12 8 11 12 ## 5 gene5 14 7 8 9 11 10 13 13 5 11 ## 6 gene6 12 12 11 15 8 7 10 9 10 15 ## 7 gene7 11 11 14 11 11 5 9 13 13 7 ## 8 gene8 9 12 9 8 6 14 7 12 12 10 ## 9 gene9 14 12 11 7 10 10 8 14 7 10 ## 10 gene10 11 10 9 7 11 16 8 7 7 4 ggplot(data = counts, mapping = aes(x = cell1, y = cell2)) Let’s take a closer look at the final command, ggplot(data = counts, mapping = aes(x = cell1, y = cell2)). ggplot() initialises a ggplot object and takes the arguments data and mapping. We pass our dataframe of counts to data and use the aes() function to specify that we would like to use the variable cell1 as our x variable and the variable cell2 as our y variable. Task 1: Modify the command above to initialise a ggplot object where cell10 is the x variable and cell8 is the y variable. Clearly, the plots we have just created are not very informative because no data is displayed on them. To display data, we will need to use geoms. 3.10.4 Geoms We can use geoms to specify how we would like data to be displayed on our graphs. For example, our choice of geom could specify that we would like our data to be displayed as a scatterplot, a barplot or a boxplot. Let’s see how our graph would look as a scatterplot. ggplot(data = counts, mapping = aes(x = cell1, y = cell2)) + geom_point() Now we can see that there doesn’t seem to be any correlation between gene expression in cell1 and cell2. Given we generated counts randomly, this isn’t too surprising. 
Task 2: Modify the command above to create a line plot. Hint: execute ?ggplot and scroll down the help page. At the bottom is a link to the ggplot package index. Scroll through the index until you find the geom options. 3.10.5 Plotting data from more than 2 cells So far we’ve been considering the gene counts from 2 of the cells in our dataframe. But there are actually 10 cells in our dataframe and it would be nice to compare all of them. What if we wanted to plot data from all 10 cells at the same time? At the moment we can’t do this because we are treating each individual cell as a variable and assigning that variable to either the x or the y axis. We could create a 10 dimensional graph to plot data from all 10 cells on, but this is a) not possible to do with ggplot and b) not very easy to interpret. What we could do instead is to tidy our data so that we had one variable representing cell ID and another variable representing gene counts, and plot those against each other. In code, this would look like: counts<-gather(counts, colnames(counts)[2:11], key = 'Cell_ID', value='Counts') head(counts) ## Gene_ids Cell_ID Counts ## 1 gene1 cell1 8 ## 2 gene2 cell1 10 ## 3 gene3 cell1 7 ## 4 gene4 cell1 11 ## 5 gene5 cell1 14 ## 6 gene6 cell1 12 Essentially, the problem before was that our data was not tidy because one variable (Cell_ID) was spread over multiple columns. Now that we’ve fixed this problem, it is much easier for us to plot data from all 10 cells on one graph. ggplot(counts,aes(x=Cell_ID, y=Counts)) + geom_boxplot() Task 3: Use the updated counts dataframe to plot a barplot with Cell_ID as the x variable and Counts as the y variable. Hint: you may find it helpful to read ?geom_bar. Task 4: Use the updated counts dataframe to plot a scatterplot with Gene_ids as the x variable and Counts as the y variable. 3.10.6 Plotting heatmaps A common method for visualising gene expression data is with a heatmap. 
Here we will use the R package pheatmap to perform this analysis with some gene expression data we will name test. library(pheatmap) set.seed(2) test = matrix(rnorm(200), 20, 10) test[1:10, seq(1, 10, 2)] = test[1:10, seq(1, 10, 2)] + 3 test[11:20, seq(2, 10, 2)] = test[11:20, seq(2, 10, 2)] + 2 test[15:20, seq(2, 10, 2)] = test[15:20, seq(2, 10, 2)] + 4 colnames(test) = paste("Cell", 1:10, sep = "") rownames(test) = paste("Gene", 1:20, sep = "") pheatmap(test) Let’s take a moment to work out what this graphic is showing us. Each row represents a gene and each column represents a cell. How highly expressed each gene is in each cell is represented by the colour of the corresponding box. For example, we can tell from this plot that gene18 is highly expressed in cell10 but lowly expressed in cell1. This plot also gives us information on the results of a clustering algorithm. In general, clustering algorithms aim to split datapoints (eg.cells) into groups whose members are more alike one another than they are alike the rest of the datapoints. The trees drawn on the top and left hand sides of the graph are the results of clustering algorithms and enable us to see, for example, that cells 4,8,2,6 and 10 are more alike one another than they are alike cells 7,3,5,1 and 9. The tree on the left hand side of the graph represents the results of a clustering algorithm applied to the genes in our dataset. If we look closely at the trees, we can see that eventually they have the same number of branches as there are cells and genes. In other words, the total number of cell clusters is the same as the total number of cells, and the total number of gene clusters is the same as the total number of genes. Clearly, this is not very informative, and will become impractical when we are looking at more than 10 cells and 20 genes. Fortunately, we can set the number of clusters we see on the plot. 
Let’s try setting the number of gene clusters to 2: pheatmap(test, kmeans_k = 2) Now we can see that the genes fall into two clusters - a cluster of 8 genes which are upregulated in cells 2, 10, 6, 4 and 8 relative to the other cells and a cluster of 12 genes which are downregulated in cells 2, 10, 6, 4 and 8 relative to the other cells. Task 5: Try setting the number of clusters to 3. Which number of clusters do you think is more informative? 3.10.7 Principal Component Analysis Principal component analysis (PCA) is a statistical procedure that uses a transformation to convert a set of observations into a set of values of linearly uncorrelated variables called principal components. The transformation is carried out so that the first principal component accounts for as much of the variability in the data as possible, and each following principal component accounts for the greatest amount of variance possible under the constraint that it must be orthogonal to the previous components. PCA plots are a good way to get an overview of your data, and can sometimes help identify confounders which explain a high amount of the variability in your data. We will investigate how we can use PCA plots in single-cell RNA-seq analysis in more depth in a future lab, here the aim is to give you an overview of what PCA plots are and how they are generated. Let’s make a PCA plot for our test data. We can use the ggfortify package to let ggplot know how to interpret principal components. library(ggfortify) Principal_Components<-prcomp(test) autoplot(Principal_Components, label=TRUE) Task 6: Compare your clusters to the pheatmap clusters. Are they related? 
(Hint: have a look at the gene tree for the first pheatmap we plotted) Task 7: Produce a heatmap and PCA plot for counts (below): set.seed(1) counts <- as.data.frame(matrix(rpois(100, lambda = 10), ncol=10, nrow=10)) rownames(counts) <- paste("gene", 1:10, sep = "") colnames(counts) <- paste("cell", 1:10, sep = "") References "], +["datasets.html", "4 Datasets 4.1 Deng 4.2 Tung 4.3 Pancreas 4.4 Heart 4.5 Thymus 4.6 Tabula Muris 4.7 Introduction 4.8 Downloading the data 4.9 Reading the data (Smartseq2) 4.10 Building a SingleCellExperiment object 4.11 Reading the data (10X) 4.12 Building a SingleCellExperiment object for the 10X data 4.13 Advanced Exercise", " 4 Datasets Here we provide brief descriptions of the core datasets used in this course and a more detailed description of the Tabula Muris (mouse cell atlas) data, how it can be downloaded and how it can be used. 4.1 Deng A single-cell RNA-seq dataset of 268 individual cells dissociated from in vivo F1 embryos from oocyte to blastocyst stages of mouse preimplantation development. Single-cell transcriptome profiles were generated with Smart-seq or Smart-seq2 from each individual cell with spike-ins (NB: both the Smart-seq and Smart-seq2 protocols were used, for different sets of cells in the dataset). Cells analysed here have been annotated with their developmental stages according to the original publication. Deng, Qiaolin, et al. “Single-cell RNA-seq reveals dynamic, random monoallelic gene expression in mammalian cells.” Science 343.6167 (2014): 193-196. 4.2 Tung A dataset of induced pluripotent stem cells generated from three different individuals with replicates (Tung et al. 2017) in Yoav Gilad’s lab at the University of Chicago. Data generated using Fluidigm C1 platform and to facilitate the quantification both unique molecular identifiers (UMIs) and ERCC spike-ins were used. The data files are located in the tung folder in your working directory. 
These files are the copies of the original files made on the 15/03/16. We will use these copies for reproducibility purposes. Tung, Po-Yuan, et al. “Batch effects and the effective design of single-cell gene expression studies.” Scientific reports 7 (2017): 39921. 4.3 Pancreas We have included two human pancreas datasets: from Muraro et al (2016) (Muraro et al. 2016) and Segerstolpe et al. (2016) (Segerstolpe et al. 2016). Since the pancreas has been widely studied, these datasets are well annotated. 4.3.1 Muraro Single-cell CEL-seq2 data were generated using a customised automated platform that uses FACS, robotics, and the CEL-Seq2 protocol to obtain the transcriptomes of thousands of single pancreatic cells from four deceased organ donors. Cell surface markers can be used for sorting and enriching certain cell types. (Muraro et al. 2016) Muraro, M.J. et al. (2016) A Single-Cell Transcriptome Atlas of the Human Pancreas. Cell Syst, 3, 385–394.e3. 4.3.2 Segerstolpe Single-cell RNA-seq dataset of human pancreatic cells from patients with type 2 diabetes and healthy controls. Single cells were prepared using Smart-seq2 protocol and sequenced on an Illumina HiSeq 2000. (Segerstolpe et al. 2016) Segerstolpe, Å. et al. (2016) Single-Cell Transcriptome Profiling of Human Pancreatic Islets in Health and Type 2 Diabetes. Cell Metab., 24, 593–607. 4.4 Heart data/sce/Heart_10X.rds is a SCE object containing cells from Heart tissue from the Tabula Muris dataset (details below) using 10X protocol. 4.5 Thymus data/sce/Thymus_10X.rds is a SCE object containing cells from Thymus tissue from the Tabula Muris dataset (details below) using 10X protocol. 4.6 Tabula Muris 4.7 Introduction To give you hands-on experience analyzing, from start to finish, a single-cell RNA-seq dataset we will be using, as an example, data from the Tabula Muris initial release. The Tabula Muris is an international collaboration with the aim to profile every cell-type in the mouse using a standardized method. 
They combine high-throughput but low-coverage 10X data with lower throughput but high-coverage FACS-sorted cells + Smartseq2. The initial release of the data (20 Dec 2017), contains almost 100,000 cells across 20 different tissues/organs. You might like to choose a tissue to focus on for a detailed analysis. 4.8 Downloading the data Unlike most single-cell RNA-seq data Tabula Muris has released their data through the figshare platform rather than uploading it to GEO or ArrayExpress. You can find the data by using the DOIs in their paper : 10.6084/m9.figshare.5715040 for FACS/Smartseq2 and 10.6084/m9.figshare.5715025 for 10X data. The data can be downloaded manually by clicking the doi links or by using the command-line commands below: Terminal-based download of FACS data: wget https://ndownloader.figshare.com/files/10038307 unzip 10038307 wget https://ndownloader.figshare.com/files/10038310 mv 10038310 FACS_metadata.csv wget https://ndownloader.figshare.com/files/10039267 mv 10039267 FACS_annotations.csv Terminal-based download of 10X data: wget https://ndownloader.figshare.com/files/10038325 unzip 10038325 wget https://ndownloader.figshare.com/files/10038328 mv 10038328 droplet_metadata.csv wget https://ndownloader.figshare.com/files/10039264 mv 10039264 droplet_annotation.csv Note if you download the data by hand you should unzip & rename the files as above before continuing. You should now have two folders : “FACS” and “droplet” and one annotation and metadata file for each. To inspect these files you can use the head command to see the top few lines of the text files (Press “q” to exit): head -n 10 droplet_metadata.csv You can also check the number of rows in each file using: wc -l droplet_annotation.csv Exercise How many cells do we have annotations for from FACS? from 10X? Answer FACS : 54,838 cells Droplet : 42,193 cells 4.9 Reading the data (Smartseq2) We can now read in the relevant count matrix from the comma-separated file. 
Then inspect the resulting dataframe: dat <- read.delim("FACS/Kidney-counts.csv", sep=",", header=TRUE) dat[1:5,1:5] We can see that the first column in the dataframe is the gene names, so first we move these to the rownames so we have a numeric matrix: dim(dat) rownames(dat) <- dat[,1] dat <- dat[,-1] Since this is a Smart-seq2 dataset it may contain spike-ins so lets check: rownames(dat)[grep("^ERCC-", rownames(dat))] Now we can extract much of the metadata for this data from the column names: cellIDs <- colnames(dat) cell_info <- strsplit(cellIDs, "\\\\.") Well <- lapply(cell_info, function(x){x[1]}) Well <- unlist(Well) Plate <- unlist(lapply(cell_info, function(x){x[2]})) Mouse <- unlist(lapply(cell_info, function(x){x[3]})) We can check the distributions of each of these metadata classifications: summary(factor(Mouse)) We can also check if any technical factors are confounded: table(Mouse, Plate) Lastly we will read the computationally inferred cell-type annotation and match them to the cell in our expression matrix: ann <- read.table("FACS_annotations.csv", sep=",", header=TRUE) ann <- ann[match(cellIDs, ann[,1]),] celltype <- ann[,3] 4.10 Building a SingleCellExperiment object To create a SingleCellExperiment object we must put together all the cell annotations into a single dataframe, since the experimental batch (PCR plate) is completely confounded with donor mouse we will only keep one of them. 
library("SingleCellExperiment") library("scater") cell_anns <- data.frame(mouse = Mouse, well=Well, type=celltype) rownames(cell_anns) <- colnames(dat) sceset <- SingleCellExperiment(assays = list(counts = as.matrix(dat)), colData=cell_anns) Finally if the dataset contains spike-ins we use a hidden variable in the SingleCellExperiment object to track them: isSpike(sceset, "ERCC") <- grepl("ERCC-", rownames(sceset)) 4.11 Reading the data (10X) Due to the large size and sparsity of 10X data (up to 90% of the expression matrix may be 0s) it is typically stored as a sparse matrix. The default output format for CellRanger is an .mtx file which stores this sparse matrix as a column of row coordinates, a column of column coordinates, and a column of expression values > 0. Note if you look at the .mtx file you will see two header lines followed by a line detailing the total number of rows, columns and counts for the full matrix. Since only the coordinates are stored in the .mtx file, the names of each row & column must be stored separately in the "genes.tsv" and "barcodes.tsv" files respectively. We will be using the "Matrix" package to store matrices in sparse-matrix format in R. The SingleCellExperiment class naturally handles sparse matrices, and many downstream tools including scater, scran and DropletUtils also handle data stored in sparse matrices, reducing the memory requirements for many early steps in an analysis. The SingleCellExperiment class can also use data in HDF5 format which allows large non-sparse matrices to be stored & accessed on disk in an efficient manner rather than loading the whole thing into RAM. library("Matrix") cellbarcodes <- read.table("droplet/Kidney-10X_P4_5/barcodes.tsv") genenames <- read.table("droplet/Kidney-10X_P4_5/genes.tsv") molecules <- readMM("droplet/Kidney-10X_P4_5/matrix.mtx") Now we will add the appropriate row and column names. 
However, if you inspect the read cellbarcodes you will see that they are just the barcode sequence associated with each cell. This is a problem since each batch of 10X data uses the same pool of barcodes so if we need to combine data from multiple 10X batches the cellbarcodes will not be unique. Hence we will attach the batch ID to each cell barcode: head(cellbarcodes) rownames(molecules) <- genenames[,1] colnames(molecules) <- paste("10X_P4_5", cellbarcodes[,1], sep="_") Now lets get the metadata and computational annotations for this data: meta <- read.delim("droplet_metadata.csv", sep=",", header = TRUE) head(meta) Here we can see that we need to use 10X_P4_5 to find the metadata for this batch, also note that the format of the mouse ID is different in this metadata table with hyphens instead of underscores and with the gender in the middle of the ID. From checking the methods section of the accompanying paper we know that the same 8 mice were used for both droplet and plate-based techniques. So we need to fix the mouse IDs to be consistent with those used in the FACS experiments. meta[meta$channel == "10X_P4_5",] mouseID <- "3_8_M" Note: depending on the tissue you choose you may have 10X data from mixed samples : e.g. mouse id = 3-M-5/6. You should still reformat these to be consistent but they will not match mouse ids from the FACS data which may affect your downstream analysis. If the mice weren’t from an inbred strain it would be possible to assign individual cells to a specific mouse using exonic-SNPs but that is beyond the scope of this course. ann <- read.delim("droplet_annotation.csv", sep=",", header=TRUE) head(ann) Again you will find a slight formating difference between the cellID in the annotation and the cellbarcodes which we will have to correct before matching them. 
ann[,1] <- paste(ann[,1], "-1", sep="") ann_subset <- ann[match(colnames(molecules), ann[,1]),] celltype <- ann_subset[,3] Now lets build the cell-metadata dataframe: cell_anns <- data.frame(mouse = rep(mouseID, times=ncol(molecules)), type=celltype) rownames(cell_anns) <- colnames(molecules); Exercise Repeat the above for the other 10X batches for your tissue. Answer 4.12 Building a SingleCellExperiment object for the 10X data Now that we have read the 10X data in multiple batches we need to combine them into a single SingleCellExperiment object. First we will check that the gene names are the same and in the same order across all batches: identical(rownames(molecules1), rownames(molecules2)) identical(rownames(molecules1), rownames(molecules3)) Now we’ll check that there aren’t any repeated cellIDs: sum(colnames(molecules1) %in% colnames(molecules2)) sum(colnames(molecules1) %in% colnames(molecules3)) sum(colnames(molecules2) %in% colnames(molecules3)) Everything is ok, so we can go ahead and combine them: all_molecules <- cbind(molecules1, molecules2, molecules3) all_cell_anns <- as.data.frame(rbind(cell_anns1, cell_anns2, cell_anns3)) all_cell_anns$batch <- rep(c("10X_P4_5", "10X_P4_6","10X_P7_5"), times = c(nrow(cell_anns1), nrow(cell_anns2), nrow(cell_anns3))) Exercise How many cells are in the whole dataset? Answer Now build the SingleCellExperiment object. One of the advantages of the SingleCellExperiment class is that it is capable of storing data in normal matrix or sparse matrix format, as well as HDF5 format which allows large non-sparse matrices to be stored & accessed on disk in an efficient manner rather than loading the whole thing into RAM. 
all_molecules <- as.matrix(all_molecules) sceset <- SingleCellExperiment( assays = list(counts = as.matrix(all_molecules)), colData = all_cell_anns ) Since this is 10X data it will not contain spike-ins, so we just save the data: saveRDS(sceset, "kidney_droplet.rds") 4.13 Advanced Exercise Write an R function/script which will fully automate this procedure for each data-type for any tissue. References "], +["processing-raw-scrna-seq-data.html", "5 Processing raw scRNA-seq data 5.1 Generating fastq files from BCLs 5.2 FastQC 5.3 Trimming Reads 5.4 Fastp 5.5 Read alignment and gene expression quantification 5.6 Full-length transcript datasets 5.7 Tag-based datasets 5.8 Practise 5.9 Identifying cell-containing droplets/microwells", " 5 Processing raw scRNA-seq data 5.1 Generating fastq files from BCLs BCLs (Illumina sequencer’s base call files) are binary files with raw sequencing data generated from sequencers. If your data processing starts BCLs you will need to make fastq files from the BCL files. More on BCL format. For others, you may have received the fastq files from your sequencing facilities or collaborators, you can refer to Section 5.2 for pre-processing on fastq files. 5.1.1 Demultiplexing In cases where multiple sample libraries are pooled together for sequencing on one lane of a flowcell to reduce seqeuncing cost, we demultiplex the samples by their sample index in the step of making fastq files from BCLs. Sample indices are ‘barcodes’ for multiplexed samples which have been constructed in the read structure during the library preparation. Figure 2.3: Example 10X Final Library Structure 5.1.2 cellranger mkfastq If you are working with 10X Genomiec data, it is best to use the cellranger mkfastq pipleline, which wraps Illumina’s bcl2fastq and provides a number of convenient features designed specifically for 10X data format. 
In order to demultiplex samples, you would also need the sample_sheet.csv file which tells the mkfastq pipeline which libraries are sequenced on which lanes of the flowcell and what sample index sets they have. For example when you have multiple libraries sequenced on one lane here: With cellranger mkfastq, you can provide a simpleSampleSheet.csv file that has: Lane Sample Index 1 test_sample SI-P03-C9 1 test_sample2 SI-P03-A3 ... SI-P03-C9 and SI-P03-A3 are the 10x sample index set names. Each of them corresponds to a mix of 4 unique oligonucleotides so that the i7 index read is balanced across all 4 bases during sequencing. There are a list of 96 sample index sets and you can use any ones of them to ‘tag’ your samples. An example command to run cellranger mkfastq /mnt/Software/cellranger/cellranger-3.0.2/cellranger mkfastq \\ --run ./input.runs.folder/ --samplesheet {input.samplesheet.csv} \\ --id run_id \\ --qc --project MAXL_2019_LIM_organoid_RNAseq \\ --output-dir data/fastq_path/ \\ --jobmode=local --localcores=20 --localmem=50 1> {log} After mkfastq, you end up with each sample’s fastq files from each sequencing lanes: test_sample_S1_L001_I1_001.fastq.gz test_sample_S1_L001_R1_001.fastq.gz test_sample_S1_L001_R2_001.fastq.gz test_sample2_S2_L001_I1_001.fastq.gz test_sample2_S2_L001_R1_001.fastq.gz test_sample2_S2_L001_R2_001.fastq.gz 5.1.3 Illumina bcl2fastq You can also use Illumina’s bcl2fastq tool directly and it is more generally applicable. bcf2fastq converts BCLs to fastqs while optionally demultiplexing sequencing data. Find the documentation of the tool here; training videos may also help to come to grips with using this tool. Figure 2.4: Sample Demultiplexing You will need to supply a SampleSheet.csv file like this: Figure 2.5: SampleSheet.csv file This information should come from your sequencing facilities. 
Running bcl2fastq can then be done like this: /usr/local/bin/bcl2fastq --runfolder-dir <RunFolder> --output-dir <BaseCalls> The output fastq files are names as SampleName_SampleNumber_Lane_Read_001.fastq.gz same with cellranger mkfastq output. (eg: Sample1_S1_L001_R1_001.fastq.gz) 5.2 FastQC Once you’ve obtained your single-cell RNA-seq data, the first thing you need to do with it is check the quality of the reads you have sequenced. For this task, today we will be using a tool called FastQC. FastQC is a quality control tool for sequencing data, which can be used for both bulk and single-cell RNA-seq data. FastQC takes sequencing data as input and returns a report on read quality. Copy and paste this link into your browser to visit the FastQC website: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ This website contains links to download and install FastQC and documentation on the reports produced. Scroll down the webpage to ‘Example Reports’ and click ‘Good Illumina Data’. This gives an example of what an ideal report should look like for high quality Illumina reads data. Now let’s make a FastQC report ourselves. Today we will be performing our analysis using a single cell from an mESC dataset produced by (Kolodziejczyk et al. 2015). The cells were sequenced using the SMART-seq2 library preparation protocol and the reads are paired end. Note You will have to download the files (both ERR522959_1.fastq and ERR522959_2.fastq) and create Share directory yourself to run the commands. You can find the files here: https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-2600/samples/ Now let’s look at the files: less Share/ERR522959_1.fastq less Share/ERR522959_2.fastq Task 1: Try to work out what command you should use to produce the FastQC report. Hint: Try executing fastqc -h This command will tell you what options are available to pass to FastQC. Feel free to ask for help if you get stuck! 
If you are successful, you should generate a .zip and a .html file for both the forwards and the reverse reads files. Once you have been successful, feel free to have a go at the next section. 5.2.1 Solution and Downloading the Report If you haven’t done so already, generate the FastQC report using the commands below: mkdir fastqc_results fastqc -o fastqc_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq Once the command has finished executing, you should have a total of four files - one zip file for each of the paired end reads, and one html file for each of the paired end reads. The report is in the html file. To view it, we will need to get it onto your computer using either filezilla or scp. Ask an instructor if you are having difficulties. Once the file is on you computer, click on it. Your FastQC report should open. Have a look through the file. Remember to look at both the forwards and the reverse end read reports! How good quality are the reads? Is there anything we should be concerned about? How might we address those concerns? 5.2.2 10X fastq qualities checks If you have generated the fastq files from cellranger mkfastq as we discussed before, you can get a list of quality metrics from the output files. 
First thing to look at will be qc_summary.json file which contains lines like this: "sample_qc": { "Sample1": { "5": { "barcode_exact_match_ratio": 0.9336158258904611, "barcode_q30_base_ratio": 0.9611993091728814, "bc_on_whitelist": 0.9447542078230667, "mean_barcode_qscore": 37.770630795934, "number_reads": 2748155, "read1_q30_base_ratio": 0.8947676653366835, "read2_q30_base_ratio": 0.7771883245304577 }, "all": { "barcode_exact_match_ratio": 0.9336158258904611, "barcode_q30_base_ratio": 0.9611993091728814, "bc_on_whitelist": 0.9447542078230667, "mean_barcode_qscore": 37.770630795934, "number_reads": 2748155, "read1_q30_base_ratio": 0.8947676653366835, "read2_q30_base_ratio": 0.7771883245304577 } } } 5.3 Trimming Reads Fortunately there is software available for read trimming. Today we will be using Trim Galore!. Trim Galore! is a wrapper for the reads trimming software cutadapt and fastqc. Read trimming software can be used to trim sequencing adapters and/or low quality reads from the ends of reads. Given we noticed there was some adaptor contamination in our FastQC report, it is a good idea to trim adaptors from our data. Task 2: What type of adapters were used in our data? Hint: Look at the FastQC report ‘Adapter Content’ plot. Now let’s try to use Trim Galore! to remove those problematic adapters. It’s a good idea to check read quality again after trimming, so after you have trimmed your reads you should use FastQC to produce another report. Task 3: Work out the command you should use to trim the adapters from our data. Hint 1: You can use the following command to find out what options you can pass to Trim Galore. trim_galore -h _Hint 2:** Read through the output of the above command carefully. The adaptor used in this experiment is quite common. Do you need to know the actual sequence of the adaptor to remove it? Task 3: Produce a FastQC report for your trimmed reads files. Is the adapter contamination gone? 
Once you think you have successfully trimmed your reads and have confirmed this by checking the FastQC report, feel free to check your results using the next section. 5.3.1 Solution You can use the command(s) below to trim the Nextera sequencing adapters: mkdir fastqc_trimmed_results trim_galore --nextera -o fastqc_trimmed_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq Remember to generate new FastQC reports for your trimmed reads files! FastQC should now show that your reads pass the ‘Adaptor Content’ plot. Feel free to ask one of the instructors if you have any questions. Congratulations! You have now generated reads quality reports and performed adaptor trimming. In the next lab, we will use STAR and Kallisto to align our trimmed and quality-checked reads to a reference transcriptome. 5.4 Fastp Fastp is an ‘all-in-one’ pre-processing tool to run on fastq files which has integrated a lot of aspects of quality profiling for both before and after filtering data (quality curves, base contents, KMER, Q20/Q30, GC Ratio, duplication, adapter contents, etc.). Example usage: mkdir fastp_results fastp -i Share/ERR522959_1.fastq -I Share/ERR522959_2.fastq \\ -o fastp_results/ERR522959_1.fastp.fastq -O fastp_results/ERR522959_2.fastp.fastq \\ --length_required 20 --average_qual 20 --detect_adapter_for_pe --correction \\ -h fastp_results/ERR522959.html -j fastp_results/ERR522959.json 5.5 Read alignment and gene expression quantification Now we have trimmed our reads and established that they are of good quality, we would like to map them to a reference genome. This process is known as alignment. Some form of alignment is generally required if we want to quantify gene expression or find genes which are differentially expressed between samples. Many tools have been developed for read alignment. STAR (Dobin et al. 2013) is one of most popularly used tools in RNA-seq read alignment. 
There are a bunch of alignment and gene expression quantification tools that are designed specifically for single-cell RNA-seq data too. Depending on your single-cell RNA-seq protocols used and the datasets generated, you can go with the following two workflows presented here for full-length or tag-based datasets. Thus, today we will focus on two alignment tools: STAR and Kallisto-BUStools, and we will discuss other available tools at the end of this chapter. 5.6 Full-length transcript datasets If your single-cell RNA-seq dataset is from plate-based protocol like Smart-seq2, then your dataset can be aligned and quantifified just like a bulk RNA-seq datasest. Each cell has a proper pair (if it’s paired-end sequncing) of fastq files (there are no CB/UMI tags in the reads). STAR is a good choice for alignment (other good choices could be Subread or Hisat2). 5.6.1 Using STAR to align reads STAR tries to find the longest possible sequence which matches one or more sequences in the reference genome. For example, in the figure below, we have a read (blue) which spans two exons and an alternative splicing junction (purple). STAR finds that the first part of the read is the same as the sequence of the first exon, whilst the second part of the read matches the sequence in the second exon. Because STAR is able to recognise splicing events in this way, it is described as a ‘splice aware’ aligner. Figure 2.3: Diagram of how STAR performs alignments, taken from Dobin et al. Usually STAR aligns reads to a reference genome, potentially allowing it to detect novel splicing events or chromosomal rearrangements. 5.6.2 Expression quantification Now you have your aligned reads in a .bam file for your single cells. The next step is to quantify the expression level of each gene per cell. We can use one of the tools which has been developed for bulk RNA-seq data, e.g. HT-seq or FeatureCounts which do ‘simple’ counting of reads overlapping with genomic features. 
Here we demostrate an example with featureCounts, that counts mapped reads for genomic features such as genes, exons, promoter, gene bodies, genomic bins and chromosomal locations. # include multimapping <featureCounts_path>/featureCounts -O -M -Q 30 -p -a hg_annotations.gtf -o outputfile ERR522959.bam # exclude multimapping <featureCounts_path>/featureCounts -Q 30 -p -a hg_annotations.gtf -o outputfile ERR522959.bam Then you will have your read counts gene expression matrix that’s ready for downstream analysis. 5.7 Tag-based datasets If your dataset is tag-based, for example 10X dataset, then you typically have sequences in R1 that entirely encode for read identities such as Cell Barcode and UMI tags. In most cases, all your cell reads (for 1k-10K cells) are in one set of fastq files. Instead of trying to demultiplex all cells into separate fastqs then do alignment and quantification, we can use tools that take care of this for you. In the following steps, we use Kallisto with bustools for generating the gene expression quantification matrix for your tag-based datasets. 5.7.1 Cellranger count If you work with 10X dataset, cellranger count pipeline may just work well for you. It comes with cellranger software suite with convenient features for 10X datasets. It takes the fastqs of a sample, and uses STAR to align all cells’ reads. It also includes reads filtering, barcode counting, and UMI counting. The output of this pipeline includes the aligned.bam file and the quantified gene expression matrix in both filtered and raw format. In V3, it has adopted the EmptyDroplet method (Lun et al., 2018), an algorithm that tries distinguish true cell barcodes from barcodes associated with droplets that did not contain a cell (i.e. empty droplets). (More details in Section 5.9 ). The filtered gene expression matrix by cellranger count V3 only includes the true cell barcodes determined by EmptyDroplet method. 
5.7.2 Kallisto/bustools and pseudo-alignment STAR is a reads aligner, whereas Kallisto is a pseudo-aligner (Bray et al. 2016). The main difference between aligners and pseudo-aligners is that whereas aligners map reads to a reference, pseudo-aligners map k-mers to a reference. 5.7.3 What is a k-mer? A k-mer is a sequence of length k derived from a read. For example, imagine we have a read with the sequence ATCCCGGGTTAT and we want to make 7-mers from it. To do this, we would find the first 7-mer by counting the first seven bases of the read. We would find the second 7-mer by moving one base along, then counting the next seven bases. Below shows all the 7-mers that could be derived from our read: ATCCCGGGTTAT ATCCCGG TCCCGGG CCCGGGT CCGGGTT CGGGTTA GGGTTAT 5.7.4 Why map k-mers rather than reads? There are two main reasons: Pseudo-aligners use k-mers and a computational trick to make pseudo-alignment much faster than traditional aligners. If you are interested in how this is acheived, see (Bray et al. 2016) for details. Under some circumstances, pseudo-aligners may be able to cope better with sequencing errors than traditional aligners. For example, imagine there was a sequencing error in the first base of the read above and the A was actually a T. This would impact on the pseudo-aligners ability to map the first 7-mer but none of the following 7-mers. 5.7.5 Kallisto’s pseudo mode Kallisto has a specially designed mode for pseudo-aligning reads from single-cell RNA-seq experiments. Unlike STAR, Kallisto psuedo-aligns to a reference transcriptome rather than a reference genome. This means Kallisto maps reads to splice isoforms rather than genes. Mapping reads to isoforms rather than genes is especially challenging for single-cell RNA-seq for the following reasons: Single-cell RNA-seq is lower coverage than bulk RNA-seq, meaning the total amount of information available from reads is reduced. 
Many single-cell RNA-seq protocols have 3’ coverage bias, meaning if two isoforms differ only at their 5’ end, it might not be possible to work out which isoform the read came from. Some single-cell RNA-seq protocols have short read lengths, which can also mean it is not possible to work out which isoform the read came from. Kallisto’s pseudo mode takes a slightly different approach to pseudo-alignment. Instead of aligning to isoforms, Kallisto aligns to equivalence classes. Essentially, this means if a read maps to multiple isoforms, Kallisto records the read as mapping to an equivalence class containing all the isoforms it maps to. Figure 2 shows a diagram which helps explain this. Figure 2.4: Overview of kallisto, The input consists of a reference transcriptome and reads from an RNA-seq experiment. (a) An example of a read (in black) and three overlapping transcripts with exonic regions as shown. (b) An index is constructed by creating the transcriptome de Bruijn Graph (T-DBG) where nodes (v1, v2, v3, … ) are k-mers, each transcript corresponds to a colored path as shown and the path cover of the transcriptome induces a k-compatibility class for each k-mer. (c) Conceptually, the k-mers of a read are hashed (black nodes) to find the k-compatibility class of a read. (d) Skipping (black dashed lines) uses the information stored in the T-DBG to skip k-mers that are redundant because they have the same k-compatibility class. (e) The k-compatibility class of the read is determined by taking the intersection of the k-compatibility classes of its constituent k-mers. Taken from Bray et al (2016). Figure 2.5: A diagram explaining Kallisto’s Equivalence Classes, taken from Ntranos et al. Note Instead of using gene or isoform expression estimates in downstream analysis such as clustering, equivalence class counts can be used instead, in this course, we focus on using gene level estimation. 
5.7.6 Running kallisto pseudo-alignment and BUStools Today, we will talk about doing single-cell pseudo-alignemnt and gene level quantification with Kallisto|BUStools. See https://pachterlab.github.io/kallisto/manual for details. As for STAR, you will need to produce an index for Kallisto before the pseudo-alignment step. Use the below command to produce the Kallisto index. Use the Kallisto manual (https://pachterlab.github.io/kallisto/manual) to work out what the options do in this command. mkdir indices/Kallisto kallisto index -i indices/Kallisto/GRCm38.idx Share/mouse/Ensembl.GRCm38.96/Mus_musculus.GRCm38.cdna.all.fa.gz In this step, an index is constructed by creating the transcriptome de Bruijn Graph (T-DBG). 5.7.6.1 BUS format BUS is a binary file format designed for UMI-tagged single-cell datasets with pseudo-aligned reads labelled with CB and UMI tags. Figure 2.7: BUS format, taken from Melsted,Páll et al. We do kallisto bus on the fastqs of single cells to generate the BUS file and then use BUStools on the generated bus files to get a gene level quantification. Check the list of technologies supported by Kallisto BUStools by kallisto bus -l Use the below command to perform pseudo-alignment and generate bus files for single-cell sequencing data. -x argument specifies the technology. List of supported single-cell technologies short name description ---------- ----------- 10xv1 10x version 1 chemistry 10xv2 10x version 2 chemistry 10xv3 10x version 3 chemistry CELSeq CEL-Seq CELSeq2 CEL-Seq version 2 DropSeq DropSeq inDrops inDrops SCRBSeq SCRB-Seq SureCell SureCell for ddSEQ mkdir results/Kallisto kallisto bus -i indices/Kallisto/GRCm38.idx -o results/Kallisto/output_bus -x '10xv2' -t 4 \\ SI-GA-G1/W11_S1_L001_R1_001.fastq.gz SI-GA-G1/W11_S1_L001_R2_001.fastq.gz \\ SI-GA-G1/W11_S1_L002_R1_001.fastq.gz SI-GA-G1/W11_S1_L002_R2_001.fastq.gz See https://pachterlab.github.io/kallisto/manual for instructions on creating bus files. 
5.7.7 Understanding the Output of Kallisto BUS Pseudo-Alignment The command above should produce 4 files - matrix.ec, transcripts.txt, run_info.json and output.bus transcripts.txt contains a list of transcript, in the same order as in the transcriptome fasta file. matrix.ec contains information about the equivalence classes used. The first number in each row is the equivalence class ID. The second number(s) correspond to the transcript ID(s) in that equivalence class. For example “10 1,2,3†would mean that equivalence class 10 contains transcript IDs 1,2 and 3. The ID numbers correspond to the order that the transcripts appear in transcripts.txt. Zero indexing is used, meaning transcript IDs 1,2 and 3 correspond to the second, third and fourth transcripts in transcripts.txt. output.bus contains the binary formated Cell Barcode and UMI tags and Sets of equivalent classes of transcripts obtained by pseudoalignment.(The fourth column is count of reads with this barcode, UMI, and equivalence class combination, which is ignored as one UMI should stand for one molecule.) run_info.json contains information about how Kallisto was executed and can be ignored. 5.7.8 Running Bustools Inputs: transcripts_to_genes.tsv: a tab delimited file of a specific format: No headers, first column is transcript ID, and second column is the corresponding gene ID. Transcript IDs must be in the same order as in the kallisto index. barcode whitelist: A whitelist that contains all the barcodes known to be present in the kit is provided by 10x and comes with CellRanger. First, bustools runs barcode error correction on the bus file. Then, the corrected bus file is sorted by barcode, UMI, and equivalence classes. After that the UMIs are counted and the counts are collapsed to the gene level. 
mkdir ./output/out_bustools/genecount ./tmp bustools correct -w ./data/whitelist_v2.txt -p ./output/out_bustools/output.bus | \\ bustools sort -T tmp/ -t 4 -p - | \\ bustools count -o ./output/out_bustools/genecount/genes -g ./output/tr2g_hgmm.tsv \\ -e ./output/out_bustools/matrix.ec -t ./output/out_bustools/transcripts.txt --genecounts - The output includes: genes.barcodes.txt genes.genes.txt genes.mtx 5.7.9 Other alignment and quantification tools available Alevin Alevin is a tool for 10X and Drop-seq data that comes with Salmon which is also a ‘pseudo-aligner’ for transcriptome quantification. Salmon is conceptually similar to Kallisto but uses different models for parameter estimation and accounts for sequence (3’ 5’-end and Fragment GC) bias correction. STARsolo STARsolo is integrated with STAR. It does mapping, demultiplexing and gene quantification for droplet-based single-cell RNA-seq (eg. 10X genomics). It follows a similar logic as Cellranger count pipeline which does error correction, UMI deduplication and then quantifies expression per gene for each cell by counting reads with different UMIs mapped per gene. STARsolo is potentially ten times faster than Cellranger count. If you are interested, here is a paper by Páll et al that compares performance of workflows in single-cell RNA-seq preprocessing. (https://www.biorxiv.org/content/10.1101/673285v2.full). 5.7.10 Summary Full-transcripts dataset: STAR -> featureCounts Tag-based dataset: Kallisto bus -> Bustools 5.8 Practise 5.8.1 Using STAR One issue with STAR is that it needs a lot of RAM, especially if your reference genome is large (eg. mouse and human). To speed up our analysis today, we will use STAR to align reads to a reference genome. Two steps are required to perform STAR alignment. In the first step, the user provides STAR with reference genome sequences (FASTA) and annotations (GTF), which STAR uses to create a genome index. 
In the second step, STAR maps the user’s reads data to the genome index. Let’s create the index now. You can obtain genomes for many model organisms from Ensembl (https://www.ensembl.org/info/data/ftp/index.html). Task 1: Execute the commands below to create the index: mkdir indices mkdir indices/STAR STAR --runThreadN 4 --runMode genomeGenerate --genomeDir indices/STAR --genomeFastaFiles Share/hg19.fa --sjdbGTFfile Share/hg_annotations.gtf Task 2: What does each of the options we used do? Hint: Use the STAR manual to help you (https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf) Task 3: How would the command we used in Task 1 be different if we were aligning to the genome rather than the transcriptome? Now that we have created the index, we can perform the mapping step. Task 4: Try to work out what command you should use to map our trimmed reads (from ERR522959) to the index you created. Use the STAR manual to help you. Once you think you know the answer, check whether it matches the solution in the next section and execute the alignment. Task 5: Try to understand the output of your alignment. Talk to one of the instructors if you need help! 5.8.2 Solution for STAR Alignment You can use the following commands to perform the mapping step: mkdir results mkdir results/STAR STAR --runThreadN 4 --genomeDir indices/STAR --readFilesIn Share/ERR522959_1.fastq Share/ERR522959_2.fastq \\ --outFileNamePrefix results/STAR/ERR522959 5.9 Identifying cell-containing droplets/microwells For droplet based methods only a fraction of droplets contain both beads and an intact cell. However, biology experiments are messy and some RNA will leak out of dead/damaged cells. So droplets without an intact cell are likely to capture a small amount of the ambient RNA which will end up in the sequencing library and contribute reads to the final sequencing output. 
The variation in droplet size, amplification efficiency, and sequencing will lead both “background†and real cells to have a wide range of library sizes. Various approaches have been used to try to distinguish those cell barcodes which correspond to real cells. 5.9.1 ‘Knee’ point One of the most used methods use the total molecules (could be applied to total reads) per barcode and try to find a “break point†between bigger libraries which are cells + some background and smaller libraries assumed to be purely background. Let’s load some example simulated data which contain both large and small cells: umi_per_barcode <- read.table("data/droplet_id_example_per_barcode.txt.gz") truth <- read.delim("data/droplet_id_example_truth.gz", sep=",") Exercise How many unique barcodes were detected? How many true cells are present in the data? To simplify calculations for this section exclude all barcodes with fewer than 10 total molecules. Answer One approach is to look for the inflection point where the total molecules per barcode suddenly drops: barcode_rank <- rank(-umi_per_barcode[,2]) plot(barcode_rank, umi_per_barcode[,2], xlim=c(1,8000)) Here we can see an roughly exponential curve of library sizes, so to make things simpler lets log-transform them. log_lib_size <- log10(umi_per_barcode[,2]) plot(barcode_rank, log_lib_size, xlim=c(1,8000)) That’s better, the “knee†in the distribution is much more pronounced. We could manually estimate where the “knee†is but it much more reproducible to algorithmically identify this point. 
# inflection point o <- order(barcode_rank) log_lib_size <- log_lib_size[o] barcode_rank <- barcode_rank[o] rawdiff <- diff(log_lib_size)/diff(barcode_rank) inflection <- which(rawdiff == min(rawdiff[100:length(rawdiff)], na.rm=TRUE)) plot(barcode_rank, log_lib_size, xlim=c(1,8000)) abline(v=inflection, col="red", lwd=2) threshold <- 10^log_lib_size[inflection] cells <- umi_per_barcode[umi_per_barcode[,2] > threshold,1] TPR <- sum(cells %in% truth[,1])/length(cells) Recall <- sum(cells %in% truth[,1])/length(truth[,1]) c(TPR, Recall) ## [1] 1.0000000 0.7831707 5.9.2 Mixture model Another approach is to fit a mixture model and find where the higher and lower distributions intersect. However, data may not fit the assumed distributions very well: set.seed(-92497) # mixture model require("mixtools") ## Loading required package: mixtools ## mixtools package, version 1.1.0, Released 2017-03-10 ## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772. mix <- normalmixEM(log_lib_size) ## number of iterations= 43 plot(mix, which=2, xlab2="log(mol per cell)") p1 <- dnorm(log_lib_size, mean=mix$mu[1], sd=mix$sigma[1]) p2 <- dnorm(log_lib_size, mean=mix$mu[2], sd=mix$sigma[2]) if (mix$mu[1] < mix$mu[2]) { split <- min(log_lib_size[p2 > p1]) } else { split <- min(log_lib_size[p1 > p2]) } Exercise Identify cells using this split point and calculate the TPR and Recall. Answer 5.9.3 Expected Number of Cells A third method used by CellRanger V2, assumes a ~10-fold range of library sizes for real cells and estimates this range using the expected number of cells. n_cells <- length(truth[,1]) # CellRanger v2 totals <- umi_per_barcode[,2] totals <- sort(totals, decreasing = TRUE) # 99th percentile of top n_cells divided by 10 thresh = totals[round(0.01*n_cells)]/10 plot(totals, xlim=c(1,8000)) abline(h=thresh, col="red", lwd=2) Exercise Identify cells using this threshold and calculate the TPR and Recall. 
Answer 5.9.4 EmptyDroplets Finally [EmptyDrops](https://github.com/MarioniLab/DropletUtils) is what we recommend using in calling cell barcodes for droplet-based single-cell datasets. It should be noted that in cellranger count v3, the EmptyDrops algorithm has been applied in their filtering step for true cell barcodes. Instead of trying to find a ‘threshold’ in UMI counts for determining true cells, EmptyDroplet uses the full genes x cells molecule count matrix for all droplets and estimates the profile of “background” RNA from those droplets with extremely low counts, then looks for cells with gene-expression profiles which differ from the background. This is combined with an inflection point method since background RNA often looks very similar to the expression profile of the largest cells in a population. As such EmptyDrops is the only method able to identify barcodes for very small cells in highly diverse samples. Below we have provided code for how this method is currently run: library("Matrix") raw.counts <- readRDS("data/pancreas/muraro.rds") library("DropletUtils") example(write10xCounts, echo=FALSE) dir.name <- tmpdir list.files(dir.name) sce <- read10xCounts(dir.name) sce my.counts <- DropletUtils:::simCounts() br.out <- barcodeRanks(my.counts) # Making a plot. 
plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total") o <- order(br.out$rank) lines(br.out$rank[o], br.out$fitted[o], col="red") abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2) abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2) legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), legend=c("knee", "inflection")) # emptyDrops set.seed(100) e.out <- emptyDrops(my.counts) is.cell <- e.out$FDR <= 0.01 sum(is.cell, na.rm=TRUE) plot(e.out$Total, -e.out$LogProb, col=ifelse(is.cell, "red", "black"), xlab="Total UMI count", ylab="-Log Probability") # plot(e.out$Total, -e.out$LogProb, col=ifelse(is.cell, "red", "black"), # xlab="Total UMI count", ylab="-Log Probability") # # cells <- colnames(raw.counts)[is.cell] # # TPR <- sum(cells %in% truth[,1])/length(cells) # Recall <- sum(cells %in% truth[,1])/length(truth[,1]) # c(TPR, Recall) References "], +["quality-control-and-data-visualisation.html", "6 Quality control and data visualisation 6.1 Expression QC overview (UMI) 6.2 Cell QC 6.3 Doublet detection 6.4 Gene QC 6.5 Exercise: Expression QC (Reads) 6.6 Data visualization and exploratory data analysis 6.7 Exercise: Data visualization (Reads)", " 6 Quality control and data visualisation The principle of garbage in, garbage out is at least as strong in single-cell genomics as it is elsewere in science. Effective quality control (QC) is crucial to high-quality scRNA-seq data analysis. We discuss principles and strategies for QC in this chapter, along with some discussion and demonstration of data visualisation approaches. 6.1 Expression QC overview (UMI) 6.1.1 Introduction Once gene expression has been quantified it is summarized as an expression matrix where each row corresponds to a gene (or transcript) and each column corresponds to a single cell. This matrix should be examined to remove poor quality cells which were not detected in either read QC or mapping QC steps. 
Failure to remove low quality cells at this stage may add technical noise which has the potential to obscure the biological signals of interest in the downstream analysis. Since there is currently no standard method for performing scRNASeq the expected values for the various QC measures that will be presented here can vary substantially from experiment to experiment. Thus, to perform QC we will be looking for cells which are outliers with respect to the rest of the dataset rather than comparing to independent quality standards. Consequently, care should be taken when comparing quality metrics across datasets collected using different protocols. 6.1.2 Tung dataset To illustrate cell QC, we consider a dataset of induced pluripotent stem cells generated from three different individuals (Tung et al. 2017) in Yoav Gilad’s lab at the University of Chicago. The experiments were carried out on the Fluidigm C1 platform and to facilitate the quantification both unique molecular identifiers (UMIs) and ERCC spike-ins were used. The data files are located in the tung folder in your working directory. These files are the copies of the original files made on the 15/03/16. We will use these copies for reproducibility purposes. 
library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) Load the data and annotations: molecules <- read.table("data/tung/molecules.txt", sep = "\\t") anno <- read.table("data/tung/annotation.txt", sep = "\\t", header = TRUE) Inspect a small portion of the expression matrix head(molecules[ , 1:3]) ## NA19098.r1.A01 NA19098.r1.A02 NA19098.r1.A03 ## ENSG00000237683 0 0 0 ## ENSG00000187634 0 0 0 ## ENSG00000188976 3 6 1 ## ENSG00000187961 0 0 0 ## ENSG00000187583 0 0 0 ## ENSG00000187642 0 0 0 head(anno) ## individual replicate well batch sample_id ## 1 NA19098 r1 A01 NA19098.r1 NA19098.r1.A01 ## 2 NA19098 r1 A02 NA19098.r1 NA19098.r1.A02 ## 3 NA19098 r1 A03 NA19098.r1 NA19098.r1.A03 ## 4 NA19098 r1 A04 NA19098.r1 NA19098.r1.A04 ## 5 NA19098 r1 A05 NA19098.r1 NA19098.r1.A05 ## 6 NA19098 r1 A06 NA19098.r1 NA19098.r1.A06 The data consists of 3 individuals and r length(unique(anno$replicate)) replicates and therefore has r length(unique(anno$batch)) batches in total. We standardize the analysis by using both SingleCellExperiment (SCE) and scater packages. 
First, create the SCE object: umi <- SingleCellExperiment( assays = list(counts = as.matrix(molecules)), colData = anno ) Remove genes that are not expressed in any cell: keep_feature <- rowSums(counts(umi) > 0) > 0 umi <- umi[keep_feature, ] Define control features (genes) - ERCC spike-ins and mitochondrial genes (provided by the authors): isSpike(umi, "ERCC") <- grepl("^ERCC-", rownames(umi)) isSpike(umi, "MT") <- rownames(umi) %in% c("ENSG00000198899", "ENSG00000198727", "ENSG00000198888", "ENSG00000198886", "ENSG00000212907", "ENSG00000198786", "ENSG00000198695", "ENSG00000198712", "ENSG00000198804", "ENSG00000198763", "ENSG00000228253", "ENSG00000198938", "ENSG00000198840") Calculate the quality metrics: umi <- calculateQCMetrics( umi, feature_controls = list( ERCC = isSpike(umi, "ERCC"), MT = isSpike(umi, "MT") ) ) ## Warning in calculateQCMetrics(umi, feature_controls = list(ERCC = ## isSpike(umi, : spike-in set 'ERCC' overwritten by feature_controls set of ## the same name 6.2 Cell QC 6.2.1 Library size Next we consider the total number of RNA molecules detected per sample (if we were using read counts rather than UMI counts this would be the total number of reads). Wells with few reads/molecules are likely to have been broken or failed to capture a cell, and should thus be removed. hist( umi$total_counts, breaks = 100 ) abline(v = 25000, col = "red") Figure 6.1: Histogram of library sizes for all cells Exercise 1 How many cells does our filter remove? What distribution do you expect that the total number of molecules for each cell should follow? Our answer ## filter_by_total_counts ## FALSE TRUE ## 46 818 6.2.2 Detected genes In addition to ensuring sufficient sequencing depth for each sample, we also want to make sure that the reads are distributed across the transcriptome. Thus, we count the total number of unique genes detected in each sample. 
hist( umi$total_features_by_counts, breaks = 100 ) abline(v = 7000, col = "red") Figure 6.2: Histogram of the number of detected genes in all cells From the plot we conclude that most cells have between 7,000-10,000 detected genes, which is normal for high-depth scRNA-seq. However, this varies by experimental protocol and sequencing depth. For example, droplet-based methods or samples with lower sequencing-depth typically detect fewer genes per cell. The most notable feature in the above plot is the “heavy tail” on the left hand side of the distribution. If detection rates were equal across the cells then the distribution should be approximately normal. Thus we remove those cells in the tail of the distribution (fewer than 7,000 detected genes). Exercise 2 How many cells does our filter remove? Our answer ## filter_by_expr_features ## FALSE TRUE ## 116 748 6.2.3 ERCCs and MTs Another measure of cell quality is the ratio between ERCC spike-in RNAs and endogenous RNAs. This ratio can be used to estimate the total amount of RNA in the captured cells. Cells with a high level of spike-in RNAs had low starting amounts of RNA, likely due to the cell being dead or stressed which may result in the RNA being degraded. plotColData( umi, x = "total_features_by_counts", y = "pct_counts_MT", colour = "batch" ) Figure 6.3: Percentage of counts in MT genes plotColData( umi, x = "total_features_by_counts", y = "pct_counts_ERCC", colour = "batch" ) Figure 6.4: Percentage of counts in ERCCs The above analysis shows that the majority of the cells from NA19098.r2 batch have a very high ERCC/Endo ratio. Indeed, it has been shown by the authors that this batch contains cells of smaller size. Exercise 3 Create filters for removing batch NA19098.r2 and cells with high expression of mitochondrial genes (>10% of total counts in a cell). 
Our answer ## filter_by_ERCC ## FALSE TRUE ## 96 768 ## filter_by_MT ## FALSE TRUE ## 31 833 Exercise 4 What would you expect to see in the ERCC vs counts plot if you were examining a dataset containing cells of different sizes (eg. normal & senescent cells)? Answer You would expect to see a group corresponding to the smaller cells (normal) with a higher fraction of ERCC reads than a separate group corresponding to the larger cells (senescent). 6.2.4 Cell filtering 6.2.4.1 Manual Now we can define a cell filter based on our previous analysis: umi$use <- ( # sufficient features (genes) filter_by_expr_features & # sufficient molecules counted filter_by_total_counts & # sufficient endogenous RNA filter_by_ERCC & # remove cells with unusual number of reads in MT genes filter_by_MT ) table(umi$use) ## ## FALSE TRUE ## 207 657 6.2.4.2 Automatic Another option available in scater is to conduct PCA on a set of QC metrics and then use automatic outlier detection to identify potentially problematic cells. By default, the following metrics are used for PCA-based outlier detection: pct_counts_top_100_features total_features pct_counts_feature_controls n_detected_feature_controls log10_counts_endogenous_features log10_counts_feature_controls scater first creates a matrix where the rows represent cells and the columns represent the different QC metrics. Then, outlier cells can also be identified by using the mvoutlier package on the QC metrics for all cells. This will identify cells that have substantially different QC metrics from the others, possibly corresponding to low-quality cells. We can visualize any outliers using a principal components plot as shown below: umi <- runPCA( umi, use_coldata = TRUE, detect_outliers = TRUE ) reducedDimNames(umi) ## [1] "PCA_coldata" Column subsetting can then be performed based on the $outlier slot, which indicates whether or not each cell has been designated as an outlier. 
Automatic outlier detection can be informative, but a close inspection of QC metrics and tailored filtering for the specifics of the dataset at hand is strongly recommended. table(umi$outlier) ## ## FALSE TRUE ## 791 73 Then, we can use a PCA plot to see a 2D representation of the cells ordered by their quality metrics. plotReducedDim( umi, use_dimred = "PCA_coldata", size_by = "total_features_by_counts", shape_by = "use", colour_by = "outlier" ) 6.2.5 Compare filterings Exercise 5 Compare the default, automatic and manual cell filters. Plot a Venn diagram of the outlier cells from these filterings. Hint: Use vennCounts and vennDiagram functions from the limma package to make a Venn diagram. Answer library(limma) auto <- colnames(umi)[umi$outlier] man <- colnames(umi)[!umi$use] venn.diag <- vennCounts( cbind(colnames(umi) %in% auto, colnames(umi) %in% man) ) vennDiagram( venn.diag, names = c("Automatic", "Manual"), circle.col = c("blue", "green") ) Figure 6.5: Comparison of the default, automatic and manual cell filters 6.3 Doublet detection For droplet-based datasets, there is chance that multiple cells are enclosed in one droplet resulting one cell barcode actually containing read information from multiple cells. One way to find doublets/multiplets in the data is to see if there are cells co-expressing markers of distinct cell types. There are also computational tools available for detecting potential doublets in the cells. A lot of these tools rely on artificial doublets formed from the datasets by randomly joining the expression profiles of two cells. Then the cells are tested against the artificial doublet profiles. We demonstrate the usage of two of these doublet detection tools. 6.3.1 scds scds(???) has two detection methods: co-expression based; binary-classification based. 
In co-expression based approach, the gene-pairs’ co-expression probablities are estimated based on a binomial model and gene pairs that do not co-expression often get higher scores when they co-expression in some cells. The cells’ doublet scores are derived based on the co-expression of pairs of genes. In the binary classification based approach, artificial doublet clusters are generated and cells are difficult to separate from the artificial doublets get higher doublet scores. library(scds) #- Annotate doublet using co-expression based doublet scoring: umi = cxds(umi) #- Annotate doublet using binary classification based doublet scoring: umi = bcds(umi) ## [1] train-error:0.056712+0.006782 test-error:0.090820+0.022608 ## Multiple eval metrics are present. Will use test_error for early stopping. ## Will train until test_error hasn't improved in 2 rounds. ## ## [2] train-error:0.042102+0.002537 test-error:0.084458+0.011641 ## [3] train-error:0.031539+0.002448 test-error:0.071155+0.009566 ## [4] train-error:0.029224+0.001912 test-error:0.072279+0.017508 ## [5] train-error:0.024595+0.002624 test-error:0.066512+0.016282 ## [6] train-error:0.021412+0.001913 test-error:0.063073+0.009557 ## [7] train-error:0.018373+0.002762 test-error:0.056687+0.016847 ## [8] train-error:0.016636+0.004358 test-error:0.052079+0.011572 ## [9] train-error:0.014466+0.002777 test-error:0.051499+0.008444 ## [10] train-error:0.012731+0.001173 test-error:0.048021+0.010077 ## [11] train-error:0.012586+0.001800 test-error:0.046292+0.011280 ## [12] train-error:0.009692+0.002442 test-error:0.045707+0.009178 ## [13] train-error:0.007957+0.002586 test-error:0.043398+0.007749 ## [14] train-error:0.007378+0.002521 test-error:0.043393+0.009114 ## [15] train-error:0.007668+0.002402 test-error:0.043398+0.008171 ## [16] train-error:0.006944+0.002024 test-error:0.041084+0.009753 ## [17] train-error:0.004919+0.002115 test-error:0.038186+0.008825 ## [18] train-error:0.004774+0.002024 
test-error:0.038761+0.008839 ## [19] train-error:0.003906+0.001863 test-error:0.037021+0.008974 ## [20] train-error:0.003038+0.001674 test-error:0.036447+0.008460 ## [21] train-error:0.002604+0.001084 test-error:0.037606+0.009120 ## [22] train-error:0.002604+0.000982 test-error:0.038181+0.010044 ## Stopping. Best iteration: ## [20] train-error:0.003038+0.001674 test-error:0.036447+0.008460 ## ## [1] train-error:0.065972 ## Will train until train_error hasn't improved in 2 rounds. ## ## [2] train-error:0.046875 ## [3] train-error:0.030671 ## [4] train-error:0.028356 ## [5] train-error:0.022569 ## [6] train-error:0.021412 ## [7] train-error:0.019676 ## [8] train-error:0.018519 ## [9] train-error:0.016204 ## [10] train-error:0.013310 ## [11] train-error:0.011574 ## [12] train-error:0.009838 ## [13] train-error:0.008102 #- Combine both annotations into a hybrid annotation umi = cxds_bcds_hybrid(umi) #- Doublet scores are now available via colData: CD = colData(umi) head(cbind(CD$cxds_score,CD$bcds_score, CD$hybrid_score)) ## [,1] [,2] [,3] ## NA19098.r1.A01 4131.405 0.013268524 0.2493021 ## NA19098.r1.A02 4564.089 0.006372486 0.2676119 ## NA19098.r1.A03 2827.904 0.002598290 0.1619169 ## NA19098.r1.A04 4708.213 0.013077467 0.2829361 ## NA19098.r1.A05 6134.590 0.005533409 0.3588618 ## NA19098.r1.A06 5810.730 0.006969100 0.3413388 plotColData( umi, x = "total_features_by_counts", y = "pct_counts_ERCC", colour = "hybrid_score" ) The scds paper features excellent descriptions and evaluations of other currently-available doublet detection methods. 6.3.2 DoubletDetection DoubletDetection is a python module that runs on raw UMI counts data. It generates artificial doublets and then perform cell clustering using the augmented dataset. Cells cluster closely to the artificial doublets across multiple iterations are predicted to be doublets. 
We provided the python scripts for running DoubletDetection on Tung datasets at ./mig_2019_scrnaseq-workshop/course_files/utils/run_doubletDetection.py python run_doubletDetection.py Here is the prediction results by DoubletDetection: require(UpSetR) ## Loading required package: UpSetR pred_tung <- read.delim(file = "data/doublets/tung.dbls.txt", header = FALSE) dim(pred_tung) ## [1] 864 1 dim(anno) ## [1] 864 5 umi$dbd_dbl <- factor(pred_tung$V1) qc_label <- read.delim(file = "data/qc_ipsc.txt") head(qc_label) ## individual replicate well cell_number concentration tra1.60 ## 1 NA19098 r1 A01 1 1.734785 1 ## 2 NA19098 r1 A02 1 1.723038 1 ## 3 NA19098 r1 A03 1 1.512786 1 ## 4 NA19098 r1 A04 1 1.347492 1 ## 5 NA19098 r1 A05 1 2.313047 1 ## 6 NA19098 r1 A06 1 2.056803 1 qc_label$sample_id <- paste0(qc_label$individual,".",qc_label$replicate,".",qc_label$well) rownames(qc_label) <- qc_label$sample_id umi$cell_number <- as.character(qc_label[umi$sample_id,"cell_number"]) umi$cell_number[qc_label$cell_number==0] <- "no_cell" umi$cell_number[qc_label$cell_number == 1] <- "single_cell" umi$cell_number[qc_label$cell_number>1] <- "multi_cell" multiplot(plotColData( umi, x = "total_features_by_counts", y = "pct_counts_ERCC", colour = "hybrid_score" ), plotColData( umi, x = "total_features_by_counts", y = "pct_counts_ERCC", colour = "dbd_dbl" ), plotColData( umi, x = "total_features_by_counts", y = "pct_counts_ERCC", colour = "cell_number" ),cols =2) doublets <- unique(umi$sample_id[umi$dbd_dbl =="1"], umi$sample_id[umi$hybrid_score > 0.8]) pl_list <- UpSetR::fromList(list(pred = doublets,qc_label = qc_label$sample_id[qc_label$cell_number >1])) UpSetR::upset(pl_list,sets = c("pred","qc_label")) 6.3.2.1 Other tools available: DoubletFinder DoubletCells as part of SimpleSingleCell Scrublet 6.4 Gene QC 6.4.1 Gene expression In addition to removing cells with poor quality, it is usually a good idea to exclude genes where we suspect that technical artefacts may have skewed the 
results. Moreover, inspection of the gene expression profiles may provide insights about how the experimental procedures could be improved. It is often instructive to consider the number of reads consumed by the top 50 expressed genes. plotHighestExprs(umi, exprs_values = "counts") Figure 6.6: Number of total counts consumed by the top 50 expressed genes The distributions are relatively flat indicating (but not guaranteeing!) good coverage of the full transcriptome of these cells. However, there are several spike-ins in the top 15 genes which suggests a greater dilution of the spike-ins may be preferable if the experiment is to be repeated. 6.4.2 Gene filtering It is typically a good idea to remove genes whose expression level is considered “undetectable”. We define a gene as detectable if at least two cells contain more than 1 transcript from the gene. If we were considering read counts rather than UMI counts a reasonable threshold is to require at least five reads in at least two cells. However, in both cases the threshold strongly depends on the sequencing depth. It is important to keep in mind that genes must be filtered after cell filtering since some genes may only be detected in poor quality cells (note colData(umi)$use filter applied to the umi dataset). keep_feature <- nexprs( umi[,colData(umi)$use], byrow = TRUE, detection_limit = 1 ) >= 2 rowData(umi)$use <- keep_feature table(keep_feature) ## keep_feature ## FALSE TRUE ## 4660 14066 Depending on the cell-type, protocol and sequencing depth, other cut-offs may be appropriate. 
6.4.3 Save the data Dimensions of the QCed dataset (do not forget about the gene filter we defined above): dim(umi[rowData(umi)$use, colData(umi)$use]) ## [1] 14066 657 Let’s create an additional slot with log-transformed counts (we will need it in the next chapters) and remove saved PCA results from the reducedDim slot: assay(umi, "logcounts_raw") <- log2(counts(umi) + 1) reducedDim(umi) <- NULL Save the data: saveRDS(umi, file = "data/tung/umi.rds") 6.4.4 Big Exercise Perform exactly the same QC analysis with read counts of the same Blischak data. Use tung/reads.txt file to load the reads. Once you have finished please compare your results to ours (next chapter). 6.4.5 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] UpSetR_1.4.0 scds_1.0.0 ## [3] limma_3.40.6 scater_1.12.2 ## [5] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [7] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [9] BiocParallel_1.18.1 matrixStats_0.55.0 ## [11] Biobase_2.44.0 GenomicRanges_1.36.1 ## [13] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [15] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] ggbeeswarm_0.6.0 colorspace_1.4-1 ## [3] mvoutlier_2.0.9 class_7.3-15 ## [5] modeltools_0.2-22 rio_0.5.16 ## [7] mclust_5.4.5 XVector_0.24.0 ## [9] pls_2.7-1 BiocNeighbors_1.2.0 ## [11] cvTools_0.3.2 
flexmix_2.3-15 ## [13] mvtnorm_1.0-11 ranger_0.11.2 ## [15] splines_3.6.0 sROC_0.1-2 ## [17] robustbase_0.93-5 knitr_1.25 ## [19] zeallot_0.1.0 robCompositions_2.1.0 ## [21] kernlab_0.9-27 cluster_2.1.0 ## [23] rrcov_1.4-7 compiler_3.6.0 ## [25] backports_1.1.4 assertthat_0.2.1 ## [27] Matrix_1.2-17 lazyeval_0.2.2 ## [29] BiocSingular_1.0.0 htmltools_0.3.6 ## [31] tools_3.6.0 rsvd_1.0.2 ## [33] gtable_0.3.0 glue_1.3.1 ## [35] GenomeInfoDbData_1.2.1 dplyr_0.8.3 ## [37] Rcpp_1.0.2 carData_3.0-2 ## [39] cellranger_1.1.0 zCompositions_1.3.2-1 ## [41] vctrs_0.2.0 sgeostat_1.0-27 ## [43] fpc_2.2-3 DelayedMatrixStats_1.6.1 ## [45] lmtest_0.9-37 xfun_0.9 ## [47] laeken_0.5.0 stringr_1.4.0 ## [49] openxlsx_4.1.0.1 lifecycle_0.1.0 ## [51] irlba_2.3.3 DEoptimR_1.0-8 ## [53] zlibbioc_1.30.0 MASS_7.3-51.1 ## [55] zoo_1.8-6 scales_1.0.0 ## [57] VIM_4.8.0 hms_0.5.1 ## [59] RColorBrewer_1.1-2 yaml_2.2.0 ## [61] curl_4.2 NADA_1.6-1 ## [63] gridExtra_2.3 reshape_0.8.8 ## [65] stringi_1.4.3 highr_0.8 ## [67] pcaPP_1.9-73 e1071_1.7-2 ## [69] boot_1.3-20 zip_2.0.4 ## [71] truncnorm_1.0-8 rlang_0.4.0 ## [73] pkgconfig_2.0.3 prabclus_2.3-1 ## [75] bitops_1.0-6 evaluate_0.14 ## [77] lattice_0.20-38 purrr_0.3.2 ## [79] labeling_0.3 cowplot_1.0.0 ## [81] tidyselect_0.2.5 GGally_1.4.0 ## [83] plyr_1.8.4 magrittr_1.5 ## [85] bookdown_0.13 R6_2.4.0 ## [87] pillar_1.4.2 haven_2.1.1 ## [89] foreign_0.8-70 withr_2.1.2 ## [91] survival_2.43-3 abind_1.4-5 ## [93] RCurl_1.95-4.12 sp_1.3-1 ## [95] nnet_7.3-12 tibble_2.1.3 ## [97] crayon_1.3.4 car_3.0-3 ## [99] xgboost_0.90.0.2 rmarkdown_1.15 ## [101] viridis_0.5.1 grid_3.6.0 ## [103] readxl_1.3.1 data.table_1.12.2 ## [105] forcats_0.4.0 diptest_0.75-7 ## [107] vcd_1.4-4 digest_0.6.21 ## [109] tidyr_1.0.0 munsell_0.5.0 ## [111] beeswarm_0.2.3 viridisLite_0.3.0 ## [113] vipor_0.4.5 6.5 Exercise: Expression QC (Reads) library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) reads <- read.table("data/tung/reads.txt", sep = "\\t") 
anno <- read.table("data/tung/annotation.txt", sep = "\\t", header = TRUE) head(reads[ , 1:3]) ## NA19098.r1.A01 NA19098.r1.A02 NA19098.r1.A03 ## ENSG00000237683 0 0 0 ## ENSG00000187634 0 0 0 ## ENSG00000188976 57 140 1 ## ENSG00000187961 0 0 0 ## ENSG00000187583 0 0 0 ## ENSG00000187642 0 0 0 head(anno) ## individual replicate well batch sample_id ## 1 NA19098 r1 A01 NA19098.r1 NA19098.r1.A01 ## 2 NA19098 r1 A02 NA19098.r1 NA19098.r1.A02 ## 3 NA19098 r1 A03 NA19098.r1 NA19098.r1.A03 ## 4 NA19098 r1 A04 NA19098.r1 NA19098.r1.A04 ## 5 NA19098 r1 A05 NA19098.r1 NA19098.r1.A05 ## 6 NA19098 r1 A06 NA19098.r1 NA19098.r1.A06 reads <- SingleCellExperiment( assays = list(counts = as.matrix(reads)), colData = anno ) keep_feature <- rowSums(counts(reads) > 0) > 0 reads <- reads[keep_feature, ] isSpike(reads, "ERCC") <- grepl("^ERCC-", rownames(reads)) isSpike(reads, "MT") <- rownames(reads) %in% c("ENSG00000198899", "ENSG00000198727", "ENSG00000198888", "ENSG00000198886", "ENSG00000212907", "ENSG00000198786", "ENSG00000198695", "ENSG00000198712", "ENSG00000198804", "ENSG00000198763", "ENSG00000228253", "ENSG00000198938", "ENSG00000198840") reads <- calculateQCMetrics( reads, feature_controls = list( ERCC = isSpike(reads, "ERCC"), MT = isSpike(reads, "MT") ) ) ## Warning in calculateQCMetrics(reads, feature_controls = list(ERCC = ## isSpike(reads, : spike-in set 'ERCC' overwritten by feature_controls set of ## the same name hist( reads$total_counts, breaks = 100 ) abline(v = 1.3e6, col = "red") Figure 6.7: Histogram of library sizes for all cells filter_by_total_counts <- (reads$total_counts > 1.3e6) table(filter_by_total_counts) ## filter_by_total_counts ## FALSE TRUE ## 180 684 hist( reads$total_features_by_counts, breaks = 100 ) abline(v = 7000, col = "red") Figure 6.8: Histogram of the number of detected genes in all cells filter_by_expr_features <- (reads$total_features_by_counts > 7000) table(filter_by_expr_features) ## filter_by_expr_features ## FALSE TRUE ## 116 748 
plotColData( reads, x = "total_features_by_counts", y = "pct_counts_MT", colour = "batch" ) Figure 6.9: Percentage of counts in MT genes plotColData( reads, x = "total_features_by_counts", y = "pct_counts_ERCC", colour = "batch" ) Figure 6.10: Percentage of counts in ERCCs filter_by_ERCC <- reads$batch != "NA19098.r2" & reads$pct_counts_ERCC < 25 table(filter_by_ERCC) ## filter_by_ERCC ## FALSE TRUE ## 103 761 filter_by_MT <- reads$pct_counts_MT < 30 table(filter_by_MT) ## filter_by_MT ## FALSE TRUE ## 18 846 reads$use <- ( # sufficient features (genes) filter_by_expr_features & # sufficient molecules counted filter_by_total_counts & # sufficient endogenous RNA filter_by_ERCC & # remove cells with unusual number of reads in MT genes filter_by_MT ) table(reads$use) ## ## FALSE TRUE ## 258 606 reads <- runPCA( reads, use_coldata = TRUE, detect_outliers = TRUE ) reducedDimNames(reads) ## [1] "PCA_coldata" table(reads$outlier) ## ## FALSE TRUE ## 753 111 plotReducedDim( reads, use_dimred = "PCA_coldata", size_by = "total_features_by_counts", shape_by = "use", colour_by = "outlier" ) library(limma) ## ## Attaching package: 'limma' ## The following object is masked from 'package:scater': ## ## plotMDS ## The following object is masked from 'package:BiocGenerics': ## ## plotMA auto <- colnames(reads)[reads$outlier] man <- colnames(reads)[!reads$use] venn.diag <- vennCounts( cbind(colnames(reads) %in% auto, colnames(reads) %in% man) ) vennDiagram( venn.diag, names = c("Automatic", "Manual"), circle.col = c("blue", "green") ) Figure 6.11: Comparison of the default, automatic and manual cell filters plotHighestExprs(reads, exprs_values = "counts") Figure 6.12: Number of total counts consumed by the top 50 expressed genes keep_feature <- nexprs( reads[,colData(reads)$use], byrow = TRUE, detection_limit = 1 ) >= 2 rowData(reads)$use <- keep_feature table(keep_feature) ## keep_feature ## FALSE TRUE ## 2664 16062 dim(reads[rowData(reads)$use, colData(reads)$use]) ## [1] 16062 
606 assay(reads, "logcounts_raw") <- log2(counts(reads) + 1) reducedDim(reads) <- NULL saveRDS(reads, file = "data/tung/reads.rds") By comparing Figure 6.5 and Figure 6.11, it is clear that the reads based filtering removed more cells than the UMI based analysis. If you go back and compare the results you should be able to conclude that the ERCC and MT filters are more strict for the reads-based analysis. sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] limma_3.40.6 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] ggbeeswarm_0.6.0 colorspace_1.4-1 ## [3] mvoutlier_2.0.9 class_7.3-15 ## [5] modeltools_0.2-22 rio_0.5.16 ## [7] mclust_5.4.5 XVector_0.24.0 ## [9] pls_2.7-1 BiocNeighbors_1.2.0 ## [11] cvTools_0.3.2 flexmix_2.3-15 ## [13] mvtnorm_1.0-11 ranger_0.11.2 ## [15] splines_3.6.0 sROC_0.1-2 ## [17] robustbase_0.93-5 knitr_1.25 ## [19] zeallot_0.1.0 robCompositions_2.1.0 ## [21] kernlab_0.9-27 cluster_2.1.0 ## [23] rrcov_1.4-7 compiler_3.6.0 ## [25] backports_1.1.4 assertthat_0.2.1 ## [27] Matrix_1.2-17 
lazyeval_0.2.2 ## [29] BiocSingular_1.0.0 htmltools_0.3.6 ## [31] tools_3.6.0 rsvd_1.0.2 ## [33] gtable_0.3.0 glue_1.3.1 ## [35] GenomeInfoDbData_1.2.1 dplyr_0.8.3 ## [37] Rcpp_1.0.2 carData_3.0-2 ## [39] cellranger_1.1.0 zCompositions_1.3.2-1 ## [41] vctrs_0.2.0 sgeostat_1.0-27 ## [43] fpc_2.2-3 DelayedMatrixStats_1.6.1 ## [45] lmtest_0.9-37 xfun_0.9 ## [47] laeken_0.5.0 stringr_1.4.0 ## [49] openxlsx_4.1.0.1 lifecycle_0.1.0 ## [51] irlba_2.3.3 DEoptimR_1.0-8 ## [53] zlibbioc_1.30.0 MASS_7.3-51.1 ## [55] zoo_1.8-6 scales_1.0.0 ## [57] VIM_4.8.0 hms_0.5.1 ## [59] RColorBrewer_1.1-2 yaml_2.2.0 ## [61] curl_4.2 NADA_1.6-1 ## [63] gridExtra_2.3 reshape_0.8.8 ## [65] stringi_1.4.3 highr_0.8 ## [67] pcaPP_1.9-73 e1071_1.7-2 ## [69] boot_1.3-20 zip_2.0.4 ## [71] truncnorm_1.0-8 rlang_0.4.0 ## [73] pkgconfig_2.0.3 prabclus_2.3-1 ## [75] bitops_1.0-6 evaluate_0.14 ## [77] lattice_0.20-38 purrr_0.3.2 ## [79] labeling_0.3 cowplot_1.0.0 ## [81] tidyselect_0.2.5 GGally_1.4.0 ## [83] plyr_1.8.4 magrittr_1.5 ## [85] bookdown_0.13 R6_2.4.0 ## [87] pillar_1.4.2 haven_2.1.1 ## [89] foreign_0.8-70 withr_2.1.2 ## [91] survival_2.43-3 abind_1.4-5 ## [93] RCurl_1.95-4.12 sp_1.3-1 ## [95] nnet_7.3-12 tibble_2.1.3 ## [97] crayon_1.3.4 car_3.0-3 ## [99] rmarkdown_1.15 viridis_0.5.1 ## [101] grid_3.6.0 readxl_1.3.1 ## [103] data.table_1.12.2 forcats_0.4.0 ## [105] diptest_0.75-7 vcd_1.4-4 ## [107] digest_0.6.21 tidyr_1.0.0 ## [109] munsell_0.5.0 beeswarm_0.2.3 ## [111] viridisLite_0.3.0 vipor_0.4.5 6.6 Data visualization and exploratory data analysis 6.6.1 Introduction In this chapter we will continue to work with the filtered Tung dataset produced in the previous chapter. We will explore different ways of visualizing the data to allow you to asses what happened to the expression matrix after the quality control step. scater package provides several very useful functions to simplify visualisation. One important aspect of single-cell RNA-seq is to control for batch effects. 
Batch effects are technical artefacts that are added to the samples during handling. For example, if two sets of samples were prepared in different labs or even on different days in the same lab, then we may observe greater similarities between the samples that were handled together. In the worst case scenario, batch effects may be mistaken for true biological variation. Data visualisation can help to identify batch effects or other unwanted sources of variation that affect our observed gene expression measurements. The Tung data allows us to explore these issues in a controlled manner since some of the salient aspects of how the samples were handled have been recorded. Ideally, we expect to see batches from the same individual grouping together and distinct groups corresponding to each individual. Data visualisation and exploratory data analysis are invaluable for allowing us to get a “feel†for a dataset. This is an area of data analysis that is perhaps more art than science, but is a crucial aspect of single-cell QC and analysis. library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) umi <- readRDS("data/tung/umi.rds") umi.qc <- umi[rowData(umi)$use, colData(umi)$use] endog_genes <- !rowData(umi.qc)$is_feature_control 6.6.2 PCA plot The easiest way to overview the data is by transforming it using the principal component analysis and then visualize the first two principal components. Principal component analysis (PCA) is a statistical procedure that uses a transformation to convert a set of observations into a set of values of linearly uncorrelated variables called principal components (PCs). The number of principal components is less than or equal to the number of original variables. Mathematically, the PCs correspond to the eigenvectors of the covariance matrix. 
The eigenvectors are sorted by eigenvalue so that the first principal component accounts for as much of the variability in the data as possible, and each succeeding component in turn has the highest variance possible under the constraint that it is orthogonal to the preceding components (the figure below is taken from here). Figure 6.13: Schematic representation of PCA dimensionality reduction 6.6.2.1 Before QC Without log-transformation: tmp <- runPCA( umi[endog_genes, ], exprs_values = "counts" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.14: PCA plot of the tung data With log-transformation: tmp <- runPCA( umi[endog_genes, ], exprs_values = "logcounts_raw" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.15: PCA plot of the tung data Clearly log-transformation is benefitial for our data - it reduces the variance on the first principal component and already separates some biological effects. Moreover, it makes the distribution of the expression values more normal. In the following analysis and chapters we will be using log-transformed raw counts by default. However, note that just a log-transformation is not enough to account for different technical factors between the cells (e.g. sequencing depth). Therefore, please do not use logcounts_raw for your downstream analysis, instead as a minimum suitable data use the logcounts slot of the SingleCellExperiment object, which not just log-transformed, but also normalised by library size (e.g. CPM normalisation). In the course we use logcounts_raw only for demonstration purposes! 
6.6.2.2 After QC tmp <- runPCA( umi.qc[endog_genes, ], exprs_values = "logcounts_raw" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.16: PCA plot of the tung data Comparing Figure 6.15 and Figure 6.16, it is clear that after quality control the NA19098.r2 cells no longer form a group of outliers. By default only the top 500 most variable genes are used by scater to calculate the PCA. This can be adjusted by changing the ntop argument. Exercise 1 How do the PCA plots change if when all 14,066 genes are used? Or when only top 50 genes are used? Why does the fraction of variance accounted for by the first PC change so dramatically? Hint Use ntop argument of the plotPCA function. Our answer Figure 6.17: PCA plot of the tung data (14214 genes) Figure 6.18: PCA plot of the tung data (50 genes) If your answers are different please compare your code with ours (you need to search for this exercise in the opened file). 6.6.3 tSNE map An alternative to PCA for visualizing scRNA-seq data is a tSNE plot. tSNE (t-Distributed Stochastic Neighbor Embedding) converts high-dimensional Euclidean distances between datapoints into conditional probabilities that represent similarities, to produce a low-dimensional representation of high-dimensional data that displays large- and local-scale structure in the dataset. Here, we map high dimensional data ( i.e. our 14,214 dimensional expression matrix) to a 2-dimensional space while preserving local distances between cells. tSNE is almost always used to produce a two-dimensional representation of a high-dimensional dataset; it is only rarely used to generate a reduced-dimension space with more than two dimensions and is typically used only for visulisation as opposed being used as a general dimension-reduction method. 
Due to the non-linear and stochastic nature of the algorithm, tSNE is more difficult to intuitively interpret than a standard dimensionality reduction method such as PCA. Things to be aware of when using tSNE: tSNE has a tendency to (visually) cluster points; as such, it often creates attractive plots of datasets with distinct cell types, but does look as good when there are continuous changes in the cell population. The hyperparameters really matter: in particular, changing the perplexity parameter can have a large effect on the visulisation produced. Perplexity is a measure of information, but can loosely be thought of as a tuning parameter that controls the number of nearest neighbous for each datapoint. Cluster sizes in a tSNE plot mean nothing. Distances between clusters might not mean anything. Random noise doesn’t always look random. You can see some shapes, sometimes. For more details about how to use tSNE effectively, see this exellent article. In contrast with PCA, tSNE is a stochastic algorithm which means running the method multiple times on the same dataset will result in different plots. To ensure reproducibility, we fix the “seed†of the random-number generator in the code below so that we always get the same plot. 6.6.3.1 Before QC set.seed(123456) tmp <- runTSNE( umi[endog_genes, ], exprs_values = "logcounts_raw", perplexity = 130 ) plotTSNE( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.19: tSNE map of the tung data 6.6.3.2 After QC set.seed(123456) tmp <- runTSNE( umi.qc[endog_genes, ], exprs_values = "logcounts_raw", perplexity = 130 ) plotTSNE( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.20: tSNE map of the tung data Interpreting PCA and tSNE plots is often challenging and due to their stochastic and non-linear nature, they are less intuitive. However, in this case it is clear that they provide a similar picture of the data. 
Comparing Figure 6.19 and 6.20, it is again clear that the samples from NA19098.r2 are no longer outliers after the QC filtering. Furthermore tSNE requires you to provide a value of perplexity which reflects the number of neighbours used to build the nearest-neighbour network; a high value creates a dense network which clumps cells together while a low value makes the network more sparse allowing groups of cells to separate from each other. scater uses a default perplexity of the total number of cells divided by five (rounded down). You can read more about the pitfalls of using tSNE here. UMAP (Uniform Manifold Approximation and Projection) is a newer alternative to tSNE which also often creates attractive visualisations of scRNA-seq data with the benefit of being faster than tSNE to compute and is a “true†dimensionality reduction method. We will look at PCA, tSNE and UMAP plots in subsequent chapters and discuss the topic of dimensionality reduction further in the Latent spaces chapter. Exercise 2 How do the tSNE plots change when a perplexity of 10 or 200 is used? How does the choice of perplexity affect the interpretation of the results? Our answer Figure 6.21: tSNE map of the tung data (perplexity = 10) Figure 6.22: tSNE map of the tung data (perplexity = 200) 6.6.4 Big Exercise Perform the same analysis with read counts of the Blischak data. Use tung/reads.rds file to load the reads SCE object. Once you have finished please compare your results to ours (next chapter). 
6.6.5 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scater_1.12.2 ggplot2_3.2.1 ## [3] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [5] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [7] matrixStats_0.55.0 Biobase_2.44.0 ## [9] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 ## [11] IRanges_2.18.3 S4Vectors_0.22.1 ## [13] BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] Rcpp_1.0.2 rsvd_1.0.2 ## [3] lattice_0.20-38 assertthat_0.2.1 ## [5] digest_0.6.21 R6_2.4.0 ## [7] evaluate_0.14 highr_0.8 ## [9] pillar_1.4.2 zlibbioc_1.30.0 ## [11] rlang_0.4.0 lazyeval_0.2.2 ## [13] irlba_2.3.3 Matrix_1.2-17 ## [15] rmarkdown_1.15 BiocNeighbors_1.2.0 ## [17] labeling_0.3 Rtsne_0.15 ## [19] stringr_1.4.0 RCurl_1.95-4.12 ## [21] munsell_0.5.0 compiler_3.6.0 ## [23] vipor_0.4.5 BiocSingular_1.0.0 ## [25] xfun_0.9 pkgconfig_2.0.3 ## [27] ggbeeswarm_0.6.0 htmltools_0.3.6 ## [29] tidyselect_0.2.5 tibble_2.1.3 ## [31] gridExtra_2.3 GenomeInfoDbData_1.2.1 ## [33] bookdown_0.13 viridisLite_0.3.0 ## [35] crayon_1.3.4 dplyr_0.8.3 ## [37] withr_2.1.2 bitops_1.0-6 ## [39] grid_3.6.0 gtable_0.3.0 ## [41] magrittr_1.5 scales_1.0.0 ## [43] stringi_1.4.3 XVector_0.24.0 ## [45] viridis_0.5.1 DelayedMatrixStats_1.6.1 ## [47] cowplot_1.0.0 tools_3.6.0 ## [49] glue_1.3.1 beeswarm_0.2.3 ## [51] purrr_0.3.2 yaml_2.2.0 
## [53] colorspace_1.4-1 knitr_1.25 6.7 Exercise: Data visualization (Reads) library(scater) options(stringsAsFactors = FALSE) reads <- readRDS("data/tung/reads.rds") reads.qc <- reads[rowData(reads)$use, colData(reads)$use] endog_genes <- !rowData(reads.qc)$is_feature_control tmp <- runPCA( reads[endog_genes, ], exprs_values = "counts" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.23: PCA plot of the tung data tmp <- runPCA( reads[endog_genes, ], exprs_values = "logcounts_raw" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.24: PCA plot of the tung data tmp <- runPCA( reads.qc[endog_genes, ], exprs_values = "logcounts_raw" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.25: PCA plot of the tung data set.seed(123456) tmp <- runTSNE( reads[endog_genes, ], exprs_values = "logcounts_raw", perplexity = 130 ) plotTSNE( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.26: tSNE map of the tung data set.seed(123456) tmp <- runTSNE( reads.qc[endog_genes, ], exprs_values = "logcounts_raw", perplexity = 130 ) plotTSNE( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Figure 6.27: tSNE map of the tung data Figure 6.21: tSNE map of the tung data (perplexity = 10) Figure 6.22: tSNE map of the tung data (perplexity = 200) sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C 
LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scater_1.12.2 ggplot2_3.2.1 ## [3] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [5] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [7] matrixStats_0.55.0 Biobase_2.44.0 ## [9] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 ## [11] IRanges_2.18.3 S4Vectors_0.22.1 ## [13] BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] Rcpp_1.0.2 rsvd_1.0.2 ## [3] lattice_0.20-38 assertthat_0.2.1 ## [5] digest_0.6.21 R6_2.4.0 ## [7] evaluate_0.14 highr_0.8 ## [9] pillar_1.4.2 zlibbioc_1.30.0 ## [11] rlang_0.4.0 lazyeval_0.2.2 ## [13] irlba_2.3.3 Matrix_1.2-17 ## [15] rmarkdown_1.15 BiocNeighbors_1.2.0 ## [17] labeling_0.3 Rtsne_0.15 ## [19] stringr_1.4.0 RCurl_1.95-4.12 ## [21] munsell_0.5.0 compiler_3.6.0 ## [23] vipor_0.4.5 BiocSingular_1.0.0 ## [25] xfun_0.9 pkgconfig_2.0.3 ## [27] ggbeeswarm_0.6.0 htmltools_0.3.6 ## [29] tidyselect_0.2.5 tibble_2.1.3 ## [31] gridExtra_2.3 GenomeInfoDbData_1.2.1 ## [33] bookdown_0.13 viridisLite_0.3.0 ## [35] crayon_1.3.4 dplyr_0.8.3 ## [37] withr_2.1.2 bitops_1.0-6 ## [39] grid_3.6.0 gtable_0.3.0 ## [41] magrittr_1.5 scales_1.0.0 ## [43] stringi_1.4.3 XVector_0.24.0 ## [45] viridis_0.5.1 DelayedMatrixStats_1.6.1 ## [47] cowplot_1.0.0 tools_3.6.0 ## [49] glue_1.3.1 beeswarm_0.2.3 ## [51] purrr_0.3.2 yaml_2.2.0 ## [53] colorspace_1.4-1 knitr_1.25 References "], +["normalization-confounders-and-batch-correction.html", "7 Normalization, confounders and batch correction 7.1 Normalization theory 7.2 Normalization practice (UMI) 7.3 Normalization practice (Reads) 7.4 Identifying confounding factors 7.5 Identifying confounding factors (Reads) 7.6 Batch effects 7.7 Dealing with confounders (Reads) 7.8 Feature Selection", " 7 Normalization, confounders and batch correction 7.1 Normalization theory 7.1.1 
Introduction In this chapter, we will explore approaches to normalization, confounder identification and batch correction for scRNA-seq data. Even in the absence of specific confounding factors, thoughtful normalization of scRNA-seq data is required. The raw count values are not directly comparable between cells, because in general the sequencing depth (number of reads obtained; often called library size) is very different across cells—orders-of-magnitude differences in sequencing depth are commonly observed between cells in an scRNA-seq dataset. If ignored, or not handled correctly, library size differences can be the dominant source of variation between single-cell gene expression profiles, obscuring the biological signal of interest. n Related to library size, differences in library composition can also cause problems when we are trying to compare expression profiles between cells. Normalization can and should also account for differences in library composition. In addition to normalization, it is also useful to identify confounding factors so that they can be accounted for in downstream analyses. In many cases, “accounting†for confounding variables may involve incorporating them as variables in a particular statistical model (e.g. in a differential expression model). In other cases, it may be desirable to “regress out†(either in a literal or figurative sense) confounding factors—the challenge for scRNA-seq data is finding the right model and/or data transformation such that regressing out confounding factors would work as desired. We discuss this further below. The issue of batch effects is just as important for scRNA-seq data as it is in other areas of genomics. Briefly, scRNA-seq and other ’omics assays are sensitive to minor differences in technical features of data generation. As such, even when assaying the same experimental or biological system, measurements taken at difference times and places or by different people will differ substantially. 
To make valid comparisons between cells, samples or groups, we first need to design our studies to be robust to batch effects and then we need to treat batch effects appropriately in our analyses. In the following sections, we will explore simple size-factor normalizations correcting for library size and composition and also discuss a more recent, conceptually quite different, approach to tackling the problem of library size differences between cells. 7.1.2 Library size Library sizes vary because scRNA-seq data is often sequenced on highly multiplexed platforms the total reads which are derived from each cell may differ substantially. Most scRNA-seq platforms and/or quantification methods currently available produce count values as the “rawâ€, “observedâ€, gene expression values. For such count data, the library size must be corrected for as part of data normalization. One popular strategy, borrowed and extended from the analysis of bulk RNA-seq data, is to multiply or divide each column of the expression matrix (in our setup columns correspond to cells) by a “normalization factor†which is an estimate of the library size relative to the other cells. Many methods to correct for library size have been developed for bulk RNA-seq and can be equally applied to scRNA-seq (eg. UQ, SF, CPM, RPKM, FPKM, TPM). In addition, single-cell specific size-factor normalization methods have been proposed to better handle the characteristics of scRNA-seq data (namely greater sparsity/proportion of zero counts). We will demonstrate use of the size-factor normalization method from the scran package in this chapter. A conceptually different approach to normalization of scRNA-seq data was proposed earlier in 2019 by (Hafemeister and Satija 2019). The idea behind the sctransform approach is to fit a regularized negative binomial model to the raw count data, with library size as the only explanatory variable in the model. 
The residuals from this model can then be used as normalized and variance-stabilized expression values. We show the use of this method too in this chapter. Some quantification methods (particularly those that quantify transcript-level expression, e.g. Salmon, kallisto) return transcripts-per-million values, TPM (instead of or in addition to count values), which effectively incorporate library size when determining gene expression estimates and thus do not require subsequent normalization for library size. However, TPM values may still be susceptible to library composition biases and so normalization may still be required. 7.1.3 Scaling or size-factor normalization methods The normalization methods discussed in this section all involve dividing the counts for each cell by a constant value to account for library size and, in some cases, library composition. These methods will typically give (adjusted/normalized) counts-per-million (CPM) or transcripts-per-million (TPM) values. Ideally, after applying one of these scaling/size-factor normalization methods, the CPM/TPM values produced are comparable across cells, with the effects of sequencing depth removed. However, even if this is true (i.e. the normalization has worked well), the CPM/TPM values do not have stable variance. Specifically, as the size of the values increases, so does the variance. This feature of the data (heteroskedacity, or asymmetric, heavy-tailed distributions) is problematic for statistical analysis methods that assume homoskedacity, that is that there is no relationship between the mean of expression values and their variance (i.e. just about anything that uses a Gaussian error model). As such, we should apply a variance stabilizing transformation to these data so that we can use standard statistical methods like linear regression and PCA with confidence. 
Developing a thoroughly effective variance stabilizing transformation is a challenge, so almost universally a log transformation (typically log2) is applied to the CPM/TPM values (the logcounts slot in a SingleCellExperiment object is expected to contain (normalized) log2-scale CPM/TPM values). For high-depth cells and highly-expressed genes this transformation generally works well (as for bulk RNA-seq data), but, as we will discuss below, it often performs sub-optimally for (sparse) scRNA-seq data. 7.1.3.1 CPM The simplest way to normalize this data is to convert it to counts per million (CPM) by dividing each column by its total then multiplying by 1,000,000. Note that spike-ins should be excluded from the calculation of total expression in order to correct for total cell RNA content, therefore we will only use endogenous genes. Example of a CPM function in R (using the scater package): calc_cpm <- function (expr_mat, spikes = NULL) { norm_factor <- colSums(expr_mat[-spikes, ]) return(t(t(expr_mat)/norm_factor)) * 10^6 } One potential drawback of CPM is if your sample contains genes that are both very highly expressed and differentially expressed across the cells. In this case, the total molecules in the cell may depend of whether such genes are on/off in the cell and normalizing by total molecules may hide the differential expression of those genes and/or falsely create differential expression for the remaining genes. Note RPKM, FPKM and TPM are variants on CPM which further adjust counts by the length of the respective gene/transcript. TPM is usually a direct output of a transcript expression quantification method (e.g. Salmon, kallisto, etc). To deal with this potentiality several other measures were devised. 7.1.3.2 RLE (SF) The size factor (SF) was proposed and popularized by DESeq (Anders and Huber 2010). First the geometric mean of each gene across all cells is calculated. 
The size factor for each cell is the median across genes of the ratio of the expression to the gene’s geometric mean. A drawback to this method is that since it uses the geometric mean only genes with non-zero expression values across all cells can be used in its calculation, making it unadvisable for large low-depth scRNASeq experiments. edgeR & scater call this method RLE for “relative log expression†(to distinguish it from the many other size-factor normalization methods that now exist). Example of a SF function in R (from the edgeR package): calc_sf <- function (expr_mat, spikes = NULL) { geomeans <- exp(rowMeans(log(expr_mat[-spikes, ]))) SF <- function(cnts) { median((cnts/geomeans)[(is.finite(geomeans) & geomeans > 0)]) } norm_factor <- apply(expr_mat[-spikes, ], 2, SF) return(t(t(expr_mat)/norm_factor)) } 7.1.3.3 UQ The upperquartile (UQ) was proposed by (Bullard et al. 2010). Here each column is divided by the 75% quantile of the counts for each library. Often the calculated quantile is scaled by the median across cells to keep the absolute level of expression relatively consistent. A drawback to this method is that for low-depth scRNASeq experiments the large number of undetected genes may result in the 75% quantile being zero (or close to it). This limitation can be overcome by generalizing the idea and using a higher quantile (eg. the 99% quantile is the default in scater) or by excluding zeros prior to calculating the 75% quantile. Example of a UQ function in R (again from the edgeR package): calc_uq <- function (expr_mat, spikes = NULL) { UQ <- function(x) { quantile(x[x > 0], 0.75) } uq <- unlist(apply(expr_mat[-spikes, ], 2, UQ)) norm_factor <- uq/median(uq) return(t(t(expr_mat)/norm_factor)) } 7.1.3.4 TMM Another method is called TMM is the weighted trimmed mean of M-values (to the reference) proposed by (Robinson and Oshlack 2010). The M-values in question are the gene-wise log2-fold changes between individual cells. 
One cell is used as the reference then the M-values for each other cell is calculated compared to this reference. These values are then trimmed by removing the top and bottom ~30%, and the average of the remaining values is calculated by weighting them to account for the effect of the log scale on variance. Each non-reference cell is multiplied by the calculated factor. Two potential issues with this method are insufficient non-zero genes left after trimming, and the assumption that most genes are not differentially expressed. sizeFactors(umi.qc) <- edgeR::calcNormFactors(counts(umi.qc), method = "TMM") 7.1.3.5 scran scran package implements a variant on CPM size-factor normalization specialized for single-cell data (L. Lun, Bach, and Marioni 2016). Briefly this method deals with the problem of vary large numbers of zero values per cell by pooling cells together calculating a normalization factor (similar to TMM) for the sum of each pool. Since each cell is found in many different pools, cell-specific factors can be deconvoluted from the collection of pool-specific factors using linear algebra. This method applies a “quick cluster†method to get rough clusters of cells to pool together to apply the strategy outlined above. qclust <- quickCluster(umi.qc, min.size = 30) umi.qc <- computeSumFactors(umi.qc, sizes = 15, clusters = qclust) 7.1.4 sctransform The sctransform method is very different from the scaling/size-factor methods discussed above. In their paper, (Hafemeister and Satija 2019) argue that the log-transformation of (normalized) CPM values does not stabilise the variance of expression values, particularly in the case of sparse(r) UMI-count data. Figure 1 of their paper (reproduced below) sets out this argument that strong relationships exist between gene expression and total cell UMI count, even after applying a scaled log-normalization method. Figure 2.4: Reproduction of Figure 1 from Hafemeister and Satija (2019). 33,148 PBMC dataset from 10x genomics. 
A) Distribution of total UMI counts / cell (’sequencing depth’). B) We placed genesinto six groups, based on their average expression in the dataset. C) For each gene group, we examined the average relationship between observed counts and cell sequencing depth. We fit a smooth line for each gene individually and combined results based on the groupings in (B). Black line shows mean, colored region indicates interquartile range. D) Same as in (C), but showing scaled log-normalized values instead of UMI counts. Values were scaled (z-scored) so that a single y-axis range could be used. E) Relationship between gene variance and cell sequencing depth; Cells were placed into five equal-sized groups based on total UMI counts (group 1 has the greatest depth), and we calculated the total variance of each gene group within each bin. For effectively normalized data, each cell bin should contribute 20% to the variance of each gene group. One effect of the failure of the scaled log-normalization to remove the relationship between total cell UMI count and expression is that dimension-reduction methods (especially PCA) applied to the log-normalized data can return reduced dimension spaces where, very often, the first dimension is highly correlated with total cell UMI count to total cell genes expressed. This effect is noted and discussed by (William Townes et al. 2019). The sctransform solution is to fit a negative binomial (NB) generalized linear model to the UMI counts for each gene, with an intercept term and a coefficient for library size (specifically using log10(total cell UMI count) as a covariate) as parameters in the model. The negative binomial model can account for much more variance in the observed count data than a simpler model like the Poisson can. To avoid overfitting the model to the data, the gene-wise intercept, library size and overdispersion parameters are regularized by fitting a loess (locally-linear smoothing method) to the per-gene estimates from the GLM. 
Figure 2.5: Reproduction of Figure 2A from Hafemeister and Satija (2019). They fit NB regression models for each gene individually, and bootstrapped the process to measure uncertainty in the resulting parameter estimates. A) Model parameters for 16,809 genes for the NB regression model, plotted as a function of average gene abundance. The color of each point indicates a parameter uncertainty score as determined by bootstrapping (Methods). Pink line shows the regularized parameters obtained via kernel regression. The regularized NB GLM is presented as an attractive middle ground between the (underfit) Poisson model and the (overfit) unregularized NB model. The Pearson residuals from the regularized NB GLM are used as “normalized†expression values for downstream analyses. Figure 2.6: Reproduction of Figure 4 from Hafemeister and Satija (2019). A) For four genes, we show the relationship between cell sequencing depth and molecular counts. White points show the observed data. Background color represents the Pearson residual magnitude under three error models. For MALAT1 (does not vary across cell types) the Poisson error model does not account for overdispersion, and incorrectly infers significant residual variation (biological heterogeneity). For S100A9 (a CD14+ Monocyte marker) and CD74 (expressed in antigen-presenting cells) the non-regularized NB model overfits the data, and collapses biological heterogeneity. For PPBP (a Megakaryocyte marker) both non-regularized models wrongly fit a negative slope. B) Boxplot of Pearson residuals for models shown in A. X-axis range shown is limited to [-8, 25] for visual clarity. The regularized NB GLM also provides a natural way to do feature selection ( i.e. find informative genes) using the deviance of the fitted GLM for each gene. We discuss this further in the Feature Selection section. We find the Pearson residuals from sctransform to be highly suitable as input to visualisation (dimension reduction) and clustering methods. 
For several other analyses (e.g. differential expression analyses), where statistical models designed for sparse count data are available, we prefer to use approaches that work with the "raw" count data. We are not yet sure how well sctransform performs on full-length transcript (i.e. non-UMI) count data. 7.1.5 Downsampling A final way to correct for library size is to downsample the expression matrix so that each cell has approximately the same total number of molecules. The benefit of this method is that zero values will be introduced by the downsampling thus eliminating any biases due to differing numbers of detected genes. However, the major drawback is that the process is not deterministic so each time the downsampling is run the resulting expression matrix is slightly different. Thus, often analyses must be run on multiple downsamplings to ensure results are robust. Downsampling to the depth of the cell with the lowest sequencing depth (that still passes QC) will typically discard much (most) of the information gathered in a (typically expensive) scRNA-seq experiment. We view this as a heavy price to pay for a normalization method that generally does not seem to outperform alternatives. Thus, we would not recommend downsampling as a normalization strategy for scRNA-seq data unless all alternatives have failed. 7.1.6 Effectiveness To compare the effectiveness of different normalization methods we will use visual inspection of PCA plots and calculation of cell-wise relative log expression via scater's plotRLE() function. Namely, cells with many (few) reads have higher (lower) than median expression for most genes resulting in a positive (negative) RLE across the cell, whereas normalized cells have an RLE close to zero. 
Example of an RLE function in R: calc_cell_RLE <- function (expr_mat, spikes = NULL) { RLE_gene <- function(x) { if (median(unlist(x)) > 0) { log((x + 1)/(median(unlist(x)) + 1))/log(2) } else { rep(NA, times = length(x)) } } if (!is.null(spikes)) { RLE_matrix <- t(apply(expr_mat[-spikes, ], 1, RLE_gene)) } else { RLE_matrix <- t(apply(expr_mat, 1, RLE_gene)) } cell_RLE <- apply(RLE_matrix, 2, median, na.rm = T) return(cell_RLE) } Note The RLE, TMM, and UQ size-factor methods were developed for bulk RNA-seq data and, depending on the experimental context, may not be appropriate for single-cell RNA-seq data, as their underlying assumptions may be problematically violated. Note The calcNormFactors function from the edgeR package implements several library size normalization methods making it easy to apply any of these methods to our data. Note edgeR makes extra adjustments to some of the normalization methods which may result in somewhat different results than if the original methods are followed exactly, e.g. edgeR's and scater's "RLE" method which is based on the "size factor" used by DESeq may give different results to the estimateSizeFactorsForMatrix method in the DESeq/DESeq2 packages. In addition, some (earlier) versions of edgeR will not calculate the normalization factors correctly unless lib.size is set at 1 for all cells. Note For CPM normalisation we use scater's calculateCPM() function. For RLE, UQ and TMM we used to use scater's normaliseExprs() function, but it was deprecated and has been removed from the package. For scran we use the scran package to calculate size factors (it also operates on SingleCellExperiment class) and scater's normalize() to normalise the data. All these normalization functions save the results to the logcounts slot of the SCE object. 7.2 Normalization practice (UMI) We will continue to work with the tung data that was used in the previous chapter. 
library(scRNA.seq.funcs) library(scater) library(scran) options(stringsAsFactors = FALSE) set.seed(1234567) umi <- readRDS("data/tung/umi.rds") umi.qc <- umi[rowData(umi)$use, colData(umi)$use] endog_genes <- !rowData(umi.qc)$is_feature_control 7.2.1 Raw tmp <- runPCA( umi.qc[endog_genes, ], exprs_values = "logcounts_raw" ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle("PCA plot: raw log-counts") Figure 7.1: PCA plot of the tung data 7.2.2 CPM logcounts(umi.qc) <- log2(calculateCPM(umi.qc, use_size_factors = FALSE) + 1) plotPCA( umi.qc[endog_genes, ], colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle("PCA plot: log2(CPM) values") Figure 7.2: PCA plot of the tung data after CPM normalisation plotRLE( umi.qc[endog_genes, ], exprs_values = "logcounts_raw", colour_by = "batch" ) + ggtitle("RLE plot: raw log-counts") Figure 7.3: Cell-wise RLE of the tung data. The relative log expression profile of each cell is represented by a boxplot, which appears as a line here. The grey bar in the middle for each cell represents the interquartile range of the RLE values; the coloured lines represent the whiskers of a boxplot and extend above and below the grey bar by 1.5 times the interquartile range. The median RLE value is shown with a circle. plotRLE( umi.qc[endog_genes, ], exprs_values = "logcounts", colour_by = "batch" ) + ggtitle("RLE plot: log2(CPM)") Figure 7.4: Cell-wise RLE of the tung data. The relative log expression profile of each cell is represented by a boxplot, which appears as a line here. The grey bar in the middle for each cell represents the interquartile range of the RLE values; the coloured lines represent the whiskers of a boxplot and extend above and below the grey bar by 1.5 times the interquartile range. The median RLE value is shown with a circle. Q: How well would you say the two approaches above normalize the data? 
7.2.3 scran scran's method for size-factor estimation will almost always be preferable for scRNA-seq data to methods that were developed for bulk RNA-seq data (TMM, RLE, UQ). Thus, we will just demonstrate the use of scran size-factor normalization here as representative of size-factor normalization more generally. The code below computes the size factors and then the normalize() function in scater applies those size factors along with the library sizes to the count matrix to produce normalized log2-counts-per-million values that are then stored in the logcounts slot of the SingleCellExperiment object. qclust <- quickCluster(umi.qc, min.size = 30, use.ranks = FALSE) umi.qc <- computeSumFactors(umi.qc, sizes = 15, clusters = qclust) umi.qc <- normalize(umi.qc) plotPCA( umi.qc[endog_genes, ], colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle("PCA plot: scran size-factor normalization") Figure 7.5: PCA plot of the tung data after LSF normalisation plotRLE( umi.qc[endog_genes, ], exprs_values = "logcounts", colour_by = "batch" ) + ggtitle("RLE plot: scran size-factor normalization") Figure 7.6: Cell-wise RLE of the tung data scran sometimes calculates negative or zero size factors. These will completely distort the normalized expression matrix. We can check the size factors scran has computed like so: summary(sizeFactors(umi.qc)) ## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.4836 0.7747 0.9532 1.0000 1.1483 3.2873 For this dataset all the size factors are reasonable so we are done. If you find scran has calculated negative size factors try increasing the cluster and pool sizes until they are all positive. We sometimes filter out cells with very large size-factors (you may like to think about why), but we will not demonstrate that here. 7.2.4 sctransform The sctransform approach to using Pearson residuals from a regularized negative binomial generalized linear model was introduced above. 
Here we demonstrate how to apply this method. Note that (due to what looks like a bug in this version of sctransform) we need to convert the UMI count matrix to a sparse format to apply sctransform. umi_sparse <- as(counts(umi.qc), "dgCMatrix") ### Genes expressed in at least 5 cells will be kept sctnorm_data <- sctransform::vst(umi = umi_sparse, min_cells = 1, cell_attr = as.data.frame(colData(umi.qc)), latent_var = "log10_total_counts_endogenous") ## | | | 0% | |======== | 12% | |================ | 25% | |======================== | 38% | |================================ | 50% | |========================================= | 62% | |================================================= | 75% | |========================================================= | 88% | |=================================================================| 100% ## | | | 0% | |= | 2% | |== | 4% | |==== | 5% | |===== | 7% | |====== | 9% | |======= | 11% | |======== | 13% | |========= | 15% | |=========== | 16% | |============ | 18% | |============= | 20% | |============== | 22% | |=============== | 24% | |================= | 25% | |================== | 27% | |=================== | 29% | |==================== | 31% | |===================== | 33% | |====================== | 35% | |======================== | 36% | |========================= | 38% | |========================== | 40% | |=========================== | 42% | |============================ | 44% | |============================== | 45% | |=============================== | 47% | |================================ | 49% | |================================= | 51% | |================================== | 53% | |=================================== | 55% | |===================================== | 56% | |====================================== | 58% | |======================================= | 60% | |======================================== | 62% | |========================================= | 64% | |=========================================== | 65% | 
|============================================ | 67% | |============================================= | 69% | |============================================== | 71% | |=============================================== | 73% | |================================================ | 75% | |================================================== | 76% | |=================================================== | 78% | |==================================================== | 80% | |===================================================== | 82% | |====================================================== | 84% | |======================================================== | 85% | |========================================================= | 87% | |========================================================== | 89% | |=========================================================== | 91% | |============================================================ | 93% | |============================================================= | 95% | |=============================================================== | 96% | |================================================================ | 98% | |=================================================================| 100% ## Pearson residuals, or deviance residuals dim(sctnorm_data$y) ## [1] 14066 657 dim(umi.qc) ## [1] 14066 657 sctnorm_data$model_str ## [1] "y ~ log10_total_counts_endogenous" assay(umi.qc, "sctrans_norm") <- sctnorm_data$y Let us look at the NB GLM model parameters estimated by sctransform. #sce$log10_total_counts ## Matrix of estimated model parameters per gene (theta and regression coefficients) sctransform::plot_model_pars(sctnorm_data) We can look at the effect of sctransform’s normalization on three particular genes, ACTB, POU5F1 (aka OCT4) and CD74. 
##c('ACTB', 'Rpl10', 'Cd74') genes_plot <- c("ENSG00000075624", "ENSG00000204531", "ENSG00000019582") sctransform::plot_model(sctnorm_data, umi_sparse, genes_plot, plot_residual = TRUE, cell_attr = as.data.frame(colData(umi.qc))) reducedDim(umi.qc, "PCA_sctrans_norm") <- reducedDim( runPCA(umi.qc[endog_genes, ], exprs_values = "sctrans_norm") ) plotReducedDim( umi.qc, use_dimred = "PCA_sctrans_norm", colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle("PCA plot: sctransform normalization") Figure 7.7: PCA plot of the tung data after sctransform normalisation (Pearson residuals). plotRLE( umi.qc[endog_genes, ], exprs_values = "sctrans_norm", colour_by = "batch" ) + ggtitle("RLE plot: sctransform normalization") Figure 7.8: Cell-wise RLE of the tung data 7.2.5 Normalisation for gene/transcript length Some methods combine library size and fragment/gene length normalization such as: RPKM - Reads Per Kilobase Million (for single-end sequencing) FPKM - Fragments Per Kilobase Million (same as RPKM but for paired-end sequencing, makes sure that paired ends mapped to the same fragment are not counted twice) TPM - Transcripts Per Kilobase Million (same as RPKM, but the order of normalizations is reversed - length first and sequencing depth second) These methods are not applicable to our dataset since the end of the transcript which contains the UMI was preferentially sequenced. Furthermore in general these should only be calculated using appropriate quantification software from aligned BAM files not from read counts since often only a portion of the entire gene/transcript is sequenced, not the entire length. If in doubt check for a relationship between gene/transcript length and expression level. However, here we show how these normalisations can be calculated using scater. First, we need to find the effective transcript length in Kilobases. 
However, our dataset contains only gene IDs, therefore we will be using the gene lengths instead of transcripts. scater uses the biomaRt package, which allows one to annotate genes by other attributes: umi.qc <- getBMFeatureAnnos( umi.qc, filters = "ensembl_gene_id", attributes = c( "ensembl_gene_id", "hgnc_symbol", "chromosome_name", "start_position", "end_position" ), biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host = "www.ensembl.org" ) # If you have mouse data, change the arguments based on this example: # getBMFeatureAnnos( # object, # filters = "ensembl_transcript_id", # attributes = c( # "ensembl_transcript_id", # "ensembl_gene_id", # "mgi_symbol", # "chromosome_name", # "transcript_biotype", # "transcript_start", # "transcript_end", # "transcript_count" # ), # biomart = "ENSEMBL_MART_ENSEMBL", # dataset = "mmusculus_gene_ensembl", # host = "www.ensembl.org" # ) Some of the genes were not annotated, therefore we filter them out: umi.qc.ann <- umi.qc[!is.na(rowData(umi.qc)$ensembl_gene_id), ] Now we compute the total gene length in Kilobases by using the end_position and start_position fields: eff_length <- abs(rowData(umi.qc.ann)$end_position - rowData(umi.qc.ann)$start_position) / 1000 plot(eff_length, rowMeans(counts(umi.qc.ann))) There is no relationship between gene length and mean expression so FPKMs & TPMs are inappropriate for this dataset. This is what we would expect for UMI protocols that tag one end of the transcript. But we will demonstrate them anyway. Note Here we calculate the total gene length instead of the total exon length. Many genes will contain lots of introns so their eff_length will be very different from what we have calculated. Please consider our calculation as an approximation. If you want to use the total exon lengths, please refer to this page. 
Now we are ready to perform the normalisations: tpm(umi.qc.ann) <- log2(calculateTPM(umi.qc.ann, eff_length) + 1) Plot the results as a PCA plot: tmp <- runPCA( umi.qc.ann, exprs_values = "tpm", ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) tpm(umi.qc.ann) <- log2(calculateFPKM(umi.qc.ann, eff_length) + 1) tmp <- runPCA( umi.qc.ann, exprs_values = "tpm", ) plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) Note The PCA looks for differences between cells. Gene length is the same across cells for each gene thus FPKM is almost identical to the CPM plot (it is just rotated) since it performs CPM first then normalizes gene length. Whereas, TPM is different because it weights genes by their length before performing CPM. 7.2.6 Reflection Q: What is your assessment of the performance of these different normalization methods on the data presented here? Q: Which normalization method would you prefer for this dataset? Why? 7.2.7 Exercise Perform the same analysis with read counts of the tung data. Use tung/reads.rds file to load the reads SCE object. Once you have finished please compare your results to ours (next chapter). 
7.2.8 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## [15] scRNA.seq.funcs_0.1.0 ## ## loaded via a namespace (and not attached): ## [1] viridis_0.5.1 dynamicTreeCut_1.63-1 ## [3] edgeR_3.26.8 BiocSingular_1.0.0 ## [5] viridisLite_0.3.0 DelayedMatrixStats_1.6.1 ## [7] elliptic_1.4-0 moments_0.14 ## [9] assertthat_0.2.1 statmod_1.4.32 ## [11] highr_0.8 dqrng_0.2.1 ## [13] GenomeInfoDbData_1.2.1 vipor_0.4.5 ## [15] yaml_2.2.0 globals_0.12.4 ## [17] pillar_1.4.2 lattice_0.20-38 ## [19] glue_1.3.1 limma_3.40.6 ## [21] digest_0.6.21 XVector_0.24.0 ## [23] colorspace_1.4-1 plyr_1.8.4 ## [25] cowplot_1.0.0 htmltools_0.3.6 ## [27] Matrix_1.2-17 pkgconfig_2.0.3 ## [29] listenv_0.7.0 bookdown_0.13 ## [31] zlibbioc_1.30.0 purrr_0.3.2 ## [33] scales_1.0.0 Rtsne_0.15 ## [35] tibble_2.1.3 withr_2.1.2 ## [37] lazyeval_0.2.2 magrittr_1.5 ## [39] crayon_1.3.4 evaluate_0.14 ## [41] future_1.14.0 MASS_7.3-51.1 ## [43] beeswarm_0.2.3 tools_3.6.0 ## [45] stringr_1.4.0 locfit_1.5-9.1 ## [47] munsell_0.5.0 irlba_2.3.3 ## [49] 
orthopolynom_1.0-5 compiler_3.6.0 ## [51] rsvd_1.0.2 contfrac_1.1-12 ## [53] rlang_0.4.0 grid_3.6.0 ## [55] RCurl_1.95-4.12 BiocNeighbors_1.2.0 ## [57] igraph_1.2.4.1 labeling_0.3 ## [59] bitops_1.0-6 rmarkdown_1.15 ## [61] codetools_0.2-16 hypergeo_1.2-13 ## [63] gtable_0.3.0 deSolve_1.24 ## [65] reshape2_1.4.3 R6_2.4.0 ## [67] gridExtra_2.3 knitr_1.25 ## [69] dplyr_0.8.3 future.apply_1.3.0 ## [71] stringi_1.4.3 ggbeeswarm_0.6.0 ## [73] Rcpp_1.0.2 sctransform_0.2.0 ## [75] tidyselect_0.2.5 xfun_0.9 7.3 Normalization practice (Reads) Figure 7.9: PCA plot of the tung data Figure 7.10: PCA plot of the tung data after CPM normalisation Figure 7.11: Cell-wise RLE of the tung data Figure 7.12: Cell-wise RLE of the tung data ## Warning: Setting 'use.ranks=TRUE' for the old defaults. ## Set 'use.ranks=FALSE' for the new defaults. ## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its ## own size factors ## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own ## size factors Figure 7.13: PCA plot of the tung data after LSF normalisation Figure 7.14: Cell-wise RLE of the tung data Figure 7.15: Cell-wise RLE of the tung data ## Calculating cell attributes for input UMI matrix ## Variance stabilizing transformation of count matrix of size 16062 by 606 ## Model formula is y ~ log10_total_counts_endogenous ## Get Negative Binomial regression parameters per gene ## Using 2000 genes, 606 cells ## | | | 0% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## | |======== | 12% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |================ | 25% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## | |======================== | 38% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): NaNs produced ## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |================================ | 50% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## | |========================================= | 62% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## | |================================================= | 75% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |========================================================= | 88% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## | |=================================================================| 100% ## Found 1 outliers - those will be ignored in fitting/regularization step ## Second step: Get residuals using 
fitted parameters for 16062 genes ## | | | 0% | |= | 2% | |== | 3% | |=== | 5% | |==== | 6% | |===== | 8% | |====== | 10% | |======= | 11% | |======== | 13% | |========= | 14% | |========== | 16% | |=========== | 17% | |============ | 19% | |============= | 21% | |============== | 22% | |=============== | 24% | |================= | 25% | |================== | 27% | |=================== | 29% | |==================== | 30% | |===================== | 32% | |====================== | 33% | |======================= | 35% | |======================== | 37% | |========================= | 38% | |========================== | 40% | |=========================== | 41% | |============================ | 43% | |============================= | 44% | |============================== | 46% | |=============================== | 48% | |================================ | 49% | |================================= | 51% | |================================== | 52% | |=================================== | 54% | |==================================== | 56% | |===================================== | 57% | |====================================== | 59% | |======================================= | 60% | |======================================== | 62% | |========================================= | 63% | |========================================== | 65% | |=========================================== | 67% | |============================================ | 68% | |============================================= | 70% | |============================================== | 71% | |=============================================== | 73% | |================================================ | 75% | |================================================== | 76% | |=================================================== | 78% | |==================================================== | 79% | |===================================================== | 81% | |====================================================== | 83% | 
|======================================================= | 84% | |======================================================== | 86% | |========================================================= | 87% | |========================================================== | 89% | |=========================================================== | 90% | |============================================================ | 92% | |============================================================= | 94% | |============================================================== | 95% | |=============================================================== | 97% | |================================================================ | 98% | |=================================================================| 100% ## Calculating gene attributes ## Wall clock passed: Time difference of 16.15881 secs ## [1] 16062 606 ## [1] 16062 606 ## [1] "y ~ log10_total_counts_endogenous" Let us look at the NB GLM model parameters estimated by sctransform. We can look at the effect of sctransform’s normalization on three particular genes, ACTB, POU5F1 (aka OCT4) and CD74. Figure 7.16: PCA plot of the tung reads data after sctransform normalisation (Pearson residuals). 
Figure 7.17: Cell-wise RLE of the tung reads data Figure 7.18: PCA plot of the tung data after TPM normalisation ## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its ## own size factors ## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own ## size factors ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## [15] scRNA.seq.funcs_0.1.0 ## ## loaded via a namespace (and not attached): ## [1] bitops_1.0-6 bit64_0.9-7 ## [3] httr_1.4.1 progress_1.2.2 ## [5] dynamicTreeCut_1.63-1 backports_1.1.4 ## [7] sctransform_0.2.0 tools_3.6.0 ## [9] R6_2.4.0 irlba_2.3.3 ## [11] hypergeo_1.2-13 vipor_0.4.5 ## [13] DBI_1.0.0 lazyeval_0.2.2 ## [15] colorspace_1.4-1 withr_2.1.2 ## [17] prettyunits_1.0.2 tidyselect_0.2.5 ## [19] gridExtra_2.3 moments_0.14 ## [21] curl_4.2 bit_1.1-14 ## [23] compiler_3.6.0 orthopolynom_1.0-5 ## [25] BiocNeighbors_1.2.0 labeling_0.3 ## [27] bookdown_0.13 scales_1.0.0 ## [29] stringr_1.4.0 digest_0.6.21 ## [31] rmarkdown_1.15 XVector_0.24.0 ## [33] pkgconfig_2.0.3 
htmltools_0.3.6 ## [35] limma_3.40.6 highr_0.8 ## [37] rlang_0.4.0 RSQLite_2.1.2 ## [39] DelayedMatrixStats_1.6.1 dplyr_0.8.3 ## [41] RCurl_1.95-4.12 magrittr_1.5 ## [43] BiocSingular_1.0.0 GenomeInfoDbData_1.2.1 ## [45] Matrix_1.2-17 Rcpp_1.0.2 ## [47] ggbeeswarm_0.6.0 munsell_0.5.0 ## [49] viridis_0.5.1 stringi_1.4.3 ## [51] yaml_2.2.0 edgeR_3.26.8 ## [53] MASS_7.3-51.1 zlibbioc_1.30.0 ## [55] Rtsne_0.15 plyr_1.8.4 ## [57] blob_1.2.0 grid_3.6.0 ## [59] listenv_0.7.0 dqrng_0.2.1 ## [61] crayon_1.3.4 contfrac_1.1-12 ## [63] lattice_0.20-38 cowplot_1.0.0 ## [65] hms_0.5.1 locfit_1.5-9.1 ## [67] zeallot_0.1.0 knitr_1.25 ## [69] pillar_1.4.2 igraph_1.2.4.1 ## [71] future.apply_1.3.0 reshape2_1.4.3 ## [73] codetools_0.2-16 biomaRt_2.40.4 ## [75] XML_3.98-1.20 glue_1.3.1 ## [77] evaluate_0.14 deSolve_1.24 ## [79] vctrs_0.2.0 gtable_0.3.0 ## [81] purrr_0.3.2 future_1.14.0 ## [83] assertthat_0.2.1 xfun_0.9 ## [85] rsvd_1.0.2 viridisLite_0.3.0 ## [87] tibble_2.1.3 elliptic_1.4-0 ## [89] memoise_1.1.0 AnnotationDbi_1.46.1 ## [91] beeswarm_0.2.3 globals_0.12.4 ## [93] statmod_1.4.32 7.4 Identifying confounding factors 7.4.1 Introduction There is a large number of potential confounders, artifacts and biases in scRNA-seq data. One of the main challenges in analysing scRNA-seq data stems from the fact that it is difficult to carry out a true technical replication (why?) to distinguish biological and technical variability. In the previous chapters we considered normalization and in this chapter we will continue to explore how experimental artifacts can be identified and removed. We will continue using the scater package since it provides a set of methods specifically for quality control of experimental and explanatory variables. Moreover, we will continue to work with the Blischak data that was used in the previous chapter. 
library(scater, quietly = TRUE) library(scran) options(stringsAsFactors = FALSE) umi <- readRDS("data/tung/umi.rds") umi.qc <- umi[rowData(umi)$use, colData(umi)$use] endog_genes <- !rowData(umi.qc)$is_feature_control The umi.qc dataset contains filtered cells and genes. Our next step is to explore technical drivers of variability in the data to inform data normalisation before downstream analysis. 7.4.2 Correlations with PCs Let’s first look again at the PCA plot of the QCed dataset using the scran-normalized log2-CPM values: qclust <- quickCluster(umi.qc, min.size = 30, use.ranks = FALSE) umi.qc <- computeSumFactors(umi.qc, sizes = 15, clusters = qclust) umi.qc <- normalize(umi.qc) ## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its ## own size factors ## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own ## size factors reducedDim(umi.qc, "PCA") <- reducedDim( runPCA(umi.qc[endog_genes,], exprs_values = "logcounts", ncomponents = 10), "PCA") plotPCA( umi.qc, colour_by = "batch", size_by = "total_features_by_counts" ) Figure 7.19: PCA plot of the tung data scater allows one to identify principal components that correlate with experimental and QC variables of interest (it ranks principle components by \\(R^2\\) from a linear model regressing PC value against the variable of interest). Let’s test whether some of the variables correlate with any of the PCs. 7.4.2.1 Top colData variables associated with PCs The plot below shows, for each of the first 10 PCs, the variance explained by the ten variables in colData(umi.qc) that are most strongly associated with the PCs. [We will ignore the sample_id variable: it has a unique value for each cell, so can explain all the variation for all PCs.] 
plotExplanatoryPCs(umi.qc) ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'is_cell_control' with fewer than 2 unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_100_features_feature_control' with fewer than 2 ## unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_200_features_feature_control' with fewer than 2 ## unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_500_features_feature_control' with fewer than 2 ## unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_50_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_100_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_200_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_500_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_50_features_MT' with fewer than 2 unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_100_features_MT' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_200_features_MT' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_500_features_MT' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values 
= "pc_space", ...): ## ignoring 'use' with fewer than 2 unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'outlier' with fewer than 2 unique levels ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf Figure 7.20: PC correlation with the number of detected genes Indeed, we can see that PC1 can be almost completely explained by batch and individual (of course batch is nested within individual). The total counts from ERCC spike-ins also explains a substantial proportion of the variability in PC1. Although number of detected genes is not strongly correlated with the PCs here (after normalization), this is commonly the case and something to look out for. [You might like to replicate the plot above using raw logcounts values to see what happens without normalization]. 
This is a well-known issue in scRNA-seq and was described here. 7.4.3 Explanatory variables scater can also compute the marginal \\(R^2\\) for each variable when fitting a linear model regressing expression values for each gene against just that variable, and display a density plot of the gene-wise marginal \\(R^2\\) values for the variables. plotExplanatoryVariables( umi.qc, exprs_values = "logcounts_raw", variables = c( "total_features_by_counts", "total_counts", "batch", "individual", "pct_counts_ERCC", "pct_counts_MT" ) ) Figure 7.21: Explanatory variables This analysis indicates that the number of detected genes (again) and also the sequencing depth (total number of UMI counts per cell) have substantial explanatory power for many genes, so these variables are good candidates for conditioning out in a normalization step, or including in downstream statistical models [cf. sctransform’s approach to normalization]. Expression of ERCCs also appears to be an important explanatory variable and one notable feature of the above plot is that batch explains more than individual. What does that tell us about the technical and biological variability of the data? 7.4.4 Other confounders In addition to correcting for batch, there are other factors that one may want to compensate for. As with batch correction, these adjustments require extrinsic information. One popular method is scLVM which allows you to identify and subtract the effect from processes such as cell-cycle or apoptosis. In addition, protocols may differ in terms of their coverage of each transcript, their bias based on the average content of A/T nucleotides, or their ability to capture short transcripts. Ideally, we would like to compensate for all of these differences and biases. 7.4.5 Exercise Perform the same analysis with read counts of the Blischak data. Use tung/reads.rds file to load the reads SCESet object. Once you have finished please compare your results to ours (next chapter). 
7.4.6 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## [15] knitr_1.25 ## ## loaded via a namespace (and not attached): ## [1] locfit_1.5-9.1 Rcpp_1.0.2 ## [3] rsvd_1.0.2 lattice_0.20-38 ## [5] assertthat_0.2.1 digest_0.6.21 ## [7] R6_2.4.0 dynamicTreeCut_1.63-1 ## [9] evaluate_0.14 highr_0.8 ## [11] pillar_1.4.2 zlibbioc_1.30.0 ## [13] rlang_0.4.0 lazyeval_0.2.2 ## [15] irlba_2.3.3 Matrix_1.2-17 ## [17] rmarkdown_1.15 labeling_0.3 ## [19] BiocNeighbors_1.2.0 statmod_1.4.32 ## [21] stringr_1.4.0 igraph_1.2.4.1 ## [23] RCurl_1.95-4.12 munsell_0.5.0 ## [25] compiler_3.6.0 vipor_0.4.5 ## [27] BiocSingular_1.0.0 xfun_0.9 ## [29] pkgconfig_2.0.3 ggbeeswarm_0.6.0 ## [31] htmltools_0.3.6 tidyselect_0.2.5 ## [33] tibble_2.1.3 gridExtra_2.3 ## [35] GenomeInfoDbData_1.2.1 bookdown_0.13 ## [37] edgeR_3.26.8 viridisLite_0.3.0 ## [39] crayon_1.3.4 dplyr_0.8.3 ## [41] withr_2.1.2 bitops_1.0-6 ## [43] grid_3.6.0 gtable_0.3.0 ## [45] magrittr_1.5 scales_1.0.0 ## [47] dqrng_0.2.1 stringi_1.4.3 ## [49] XVector_0.24.0 
viridis_0.5.1 ## [51] limma_3.40.6 DelayedMatrixStats_1.6.1 ## [53] cowplot_1.0.0 tools_3.6.0 ## [55] glue_1.3.1 beeswarm_0.2.3 ## [57] purrr_0.3.2 yaml_2.2.0 ## [59] colorspace_1.4-1 7.5 Identifying confounding factors (Reads) ## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its ## own size factors ## Warning in .get_all_sf_sets(object): spike-in set 'MT' should have its own ## size factors Figure 7.22: PCA plot of the tung data ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'is_cell_control' with fewer than 2 unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_100_features_feature_control' with fewer than 2 ## unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_200_features_feature_control' with fewer than 2 ## unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_500_features_feature_control' with fewer than 2 ## unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_50_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_100_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_200_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_500_features_ERCC' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_50_features_MT' with fewer than 2 unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 
'pct_counts_in_top_100_features_MT' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_200_features_MT' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'pct_counts_in_top_500_features_MT' with fewer than 2 unique ## levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'use' with fewer than 2 unique levels ## Warning in getVarianceExplained(dummy, exprs_values = "pc_space", ...): ## ignoring 'outlier' with fewer than 2 unique levels ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf ## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning ## -Inf Figure 7.23: PC correlation with the number of detected genes Figure 7.24: Explanatory variables ## R version 3.6.0 (2019-04-26) ## Platform: 
x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] locfit_1.5-9.1 Rcpp_1.0.2 ## [3] rsvd_1.0.2 lattice_0.20-38 ## [5] assertthat_0.2.1 digest_0.6.21 ## [7] R6_2.4.0 dynamicTreeCut_1.63-1 ## [9] evaluate_0.14 highr_0.8 ## [11] pillar_1.4.2 zlibbioc_1.30.0 ## [13] rlang_0.4.0 lazyeval_0.2.2 ## [15] irlba_2.3.3 Matrix_1.2-17 ## [17] rmarkdown_1.15 labeling_0.3 ## [19] BiocNeighbors_1.2.0 statmod_1.4.32 ## [21] stringr_1.4.0 igraph_1.2.4.1 ## [23] RCurl_1.95-4.12 munsell_0.5.0 ## [25] compiler_3.6.0 vipor_0.4.5 ## [27] BiocSingular_1.0.0 xfun_0.9 ## [29] pkgconfig_2.0.3 ggbeeswarm_0.6.0 ## [31] htmltools_0.3.6 tidyselect_0.2.5 ## [33] tibble_2.1.3 gridExtra_2.3 ## [35] GenomeInfoDbData_1.2.1 bookdown_0.13 ## [37] edgeR_3.26.8 viridisLite_0.3.0 ## [39] crayon_1.3.4 dplyr_0.8.3 ## [41] withr_2.1.2 bitops_1.0-6 ## [43] grid_3.6.0 gtable_0.3.0 ## [45] magrittr_1.5 scales_1.0.0 ## [47] dqrng_0.2.1 stringi_1.4.3 ## [49] XVector_0.24.0 viridis_0.5.1 ## [51] limma_3.40.6 DelayedMatrixStats_1.6.1 ## [53] cowplot_1.0.0 
tools_3.6.0 ## [55] glue_1.3.1 beeswarm_0.2.3 ## [57] purrr_0.3.2 yaml_2.2.0 ## [59] colorspace_1.4-1 knitr_1.25 7.6 Batch effects 7.6.1 Introduction In the previous chapter we normalized for library size, effectively removing it as a confounder. Now we will consider removing other less well defined confounders from our data. Technical confounders (aka batch effects) can arise from differences in reagents, isolation methods, the lab/experimenter who performed the experiment, even which day/time the experiment was performed. Accounting for technical confounders, and batch effects particularly, is a large topic that also involves principles of experimental design. Here we address approaches that can be taken to account for confounders when the experimental design is appropriate. Fundamentally, accounting for technical confounders involves identifying and, ideally, removing sources of variation in the expression data that are not related to (i.e. are confounding) the biological signal of interest. Various approaches exist, some of which use spike-in or housekeeping genes, and some of which use endogenous genes. 7.6.1.1 Advantages and disadvantages of using spike-ins to remove confounders The use of spike-ins as control genes is conceptually appealing, since (ideally) the same amount of ERCC (or other) spike-in would be added to each cell in our experiment. In principle, all the variability we observe for these "genes" is due to technical noise; whereas endogenous genes are affected by both technical noise and biological variability. Technical noise can be removed by fitting a model to the spike-ins and "subtracting" this from the endogenous genes. There are several methods available based on this premise (e.g. BASiCS, scLVM, RUVg); each using different noise models and different fitting procedures. Alternatively, one can identify genes which exhibit significant variation beyond technical noise (e.g. Distance to median, Highly variable genes). 
Unfortunately, there are major issues with the use of spike-ins for normalisation that limit their utility in practice. Perhaps surprisingly, their variability can, for various reasons, actually be higher than that of endogenous genes. One key reason for the difficulty of their use in practice is the need to pipette minuscule volumes of spike-in solution into each well, which is difficult to do accurately and consistently at such small scales. The most popular set of spike-ins, namely ERCCs, are derived from bacterial sequences, which raises concerns that their base content and structure diverges too far from gene structure in other biological systems of interest (e.g. mammalian genes) to be reliable for normalisation. Even in the best-case scenarios, spike-ins are limited to use on plate-based platforms; they are fundamentally incompatible with droplet-based platforms. Given the issues with using spike-ins, better results can often be obtained by using endogenous genes instead. Given their limited availability, normalisation methods based only on endogenous genes needed to be developed and we consider them generally preferable, even for platforms where spike-ins may be used. Where we have a large number of endogenous genes that, on average, do not vary systematically between cells and where we expect technical effects to affect a large number of genes (a very common and reasonable assumption), then such methods (for example, the RUVs method) can perform well. We explore both general approaches below. 
library(scRNA.seq.funcs) library(RUVSeq) library(scater) library(SingleCellExperiment) library(scran) library(kBET) library(sva) # Combat library(edgeR) library(harmony) set.seed(1234567) options(stringsAsFactors = FALSE) umi <- readRDS("data/tung/umi.rds") umi.qc <- umi[rowData(umi)$use, colData(umi)$use] endog_genes <- !rowData(umi.qc)$is_feature_control erccs <- rowData(umi.qc)$is_feature_control ## Apply scran sum factor normalization qclust <- quickCluster(umi.qc, min.size = 30, use.ranks = FALSE) umi.qc <- computeSumFactors(umi.qc, sizes = 15, clusters = qclust) umi.qc <- normalize(umi.qc) 7.6.2 Linear models Linear models offer a relatively simple approach to accounting for batch effects and confounders. A linear model can correct for batches while preserving biological effects if you have a balanced design. In a confounded/replicate design biological effects will not be fit/preserved. We could remove batch effects from each individual separately in order to preserve biological (and technical) variance between individuals (we will apply a similar approach with mnnCorrect, below). Depending on how we have pre-processed our scRNA-seq data or what modelling assumptions we are willing to make, we may choose to use normal (Gaussian) linear models (i.e. assuming a normal distribution for noise) or generalized linear models (GLM), where we can use any distribution from the exponential family. Given that we obtain highly-variable count data from scRNA-seq assays, the obvious choice for a GLM is to use the negative binomial distribution, which has proven highly successful in the analysis of bulk RNA-seq data. For demonstration purposes here we will naively correct all confounded batch effects. 7.6.2.1 Gaussian (normal) linear models The limma package in Bioconductor offers a convenient and efficient means to fit a linear model (with the same design matrix) to a dataset with a large number of features (i.e. genes) (Ritchie et al. 2015). 
An added advantage of limma is its ability to apply empirical Bayes squeezing of variance estimate to improve inference. Provided we are satisfied making the assumption of a Gaussian distribution for residuals (this may be reasonable for normalized log-counts in many cases; but it may not be—debate continues in the literature), then we can apply limma to regress out (known) unwanted sources of variation as follows. ## fit a model just accounting for batch lm_design_batch <- model.matrix(~0 + batch, data = colData(umi.qc)) fit_lm_batch <- lmFit(logcounts(umi.qc), lm_design_batch) resids_lm_batch <- residuals(fit_lm_batch, logcounts(umi.qc)) assay(umi.qc, "lm_batch") <- resids_lm_batch reducedDim(umi.qc, "PCA_lm_batch") <- reducedDim( runPCA(umi.qc[endog_genes, ], exprs_values = "lm_batch"), "PCA") plotReducedDim(umi.qc, use_dimred = "PCA_lm_batch", colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle("LM - regress out batch") Two problems are immediately apparent with the approach above. First, batch is nested within individual, so simply regressing out batch as we have done above also regresses out differences between individuals that we would like to preserve. Second, we observe that the first principal component seems to separate cells by number of genes (features) expressed, which is undesirable. We can address these concerns by correcting for batch within each individual separately, and also fitting the proportion of genes expressed per cell as a covariate. [NB: to preserve overall differences in expression levels between individuals we will need to apply a slight hack to the LM fit results (setting the intercept coefficient to zero).] Exercise 2 Perform LM correction for each individual separately. Store the final corrected matrix in the lm_batch_indi slot. ## define cellular detection rate (cdr), i.e. 
proportion of genes expressed in each cell umi.qc$cdr <- umi.qc$total_features_by_counts_endogenous / nrow(umi.qc) ## fit a model just accounting for batch by individual lm_design_batch1 <- model.matrix(~batch + cdr, data = colData(umi.qc)[umi.qc$individual == "na19098",]) fit_indi1 <- lmFit(logcounts(umi.qc)[, umi.qc$individual == "na19098"], lm_design_batch1) fit_indi1$coefficients[,1] <- 0 ## replace intercept with 0 to preserve reference batch resids_lm_batch1 <- residuals(fit_indi1, logcounts(umi.qc)[, umi.qc$individual == "na19098"]) lm_design_batch2 <- model.matrix(~batch + cdr, data = colData(umi.qc)[umi.qc$individual == "na19101",]) fit_indi2 <- lmFit(logcounts(umi.qc)[, umi.qc$individual == "na19101"], lm_design_batch2) fit_indi2$coefficients[,1] <- 0 ## replace intercept with 0 to preserve reference batch resids_lm_batch2 <- residuals(fit_indi2, logcounts(umi.qc)[, umi.qc$individual == "na19101"]) lm_design_batch3 <- model.matrix(~batch + cdr, data = colData(umi.qc)[umi.qc$individual == "na19239",]) fit_indi3 <- lmFit(logcounts(umi.qc)[, umi.qc$individual == "na19239"], lm_design_batch3) fit_indi3$coefficients[,1] <- 0 ## replace intercept with 0 to preserve reference batch resids_lm_batch3 <- residuals(fit_indi3, logcounts(umi.qc)[, umi.qc$individual == "na19239"]) identical(colnames(umi.qc), colnames(cbind(resids_lm_batch1, resids_lm_batch2, resids_lm_batch3))) assay(umi.qc, "lm_batch_indi") <- cbind(resids_lm_batch1, resids_lm_batch2, resids_lm_batch3) reducedDim(umi.qc, "PCA_lm_batch_indi") <- reducedDim( runPCA(umi.qc[endog_genes, ], exprs_values = "lm_batch_indi"), "PCA") plotReducedDim(umi.qc, use_dimred = "PCA_lm_batch_indi", colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle("LM - regress out batch within individuals separately") What do you think of the results of this approach? 
7.6.2.2 Negative binomial generalized linear models 7.6.3 sctransform The sctransform approach to using Pearson residuals from an regularized negative binomial generalized linear model was introduced above. Here we demonstrate how to apply this method. Note that (due to what looks like a bug in this version of sctransform) we need to convert the UMI count matrix to a sparse format to apply sctransform. These sctransform results will face the problem mentioned above of batch being nested within individual, which means that we cannot directly remove batch effects without removing differences between individuals. However, here we will demonstrate how you would try to remove batch effects with sctransform for a kinder experimental design. umi_sparse <- as(counts(umi.qc), "dgCMatrix") ### Genes expressed in at least 5 cells will be kept sctnorm_data <- sctransform::vst(umi = umi_sparse, min_cells = 1, cell_attr = as.data.frame(colData(umi.qc)), latent_var = c("log10_total_counts_endogenous", "batch")) ## Calculating cell attributes for input UMI matrix ## Variance stabilizing transformation of count matrix of size 14066 by 657 ## Model formula is y ~ log10_total_counts_endogenous + batch ## Get Negative Binomial regression parameters per gene ## Using 2000 genes, 657 cells ## | | | 0% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in sqrt(1/i): NaNs produced ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |======== | 12% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = 
y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning 
in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |================ | 25% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## 
Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration 
limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |======================== | 38% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu 
= fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |================================ | 50% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |========================================= | 62% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |================================================= | 75% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration 
limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = 
fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |========================================================= | 88% ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu 
= fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): 
iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, 
mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in 
theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit 
reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## Warning in theta.ml(y = y, mu = fit$fitted): iteration limit reached ## | |=================================================================| 100% ## Found 123 outliers - those will be ignored in fitting/regularization step ## Second step: Get residuals using fitted parameters for 14066 genes ## | | | 0% | |= | 2% | |== | 4% | |==== | 5% | |===== | 7% | |====== | 9% | |======= | 11% | |======== | 13% | |========= | 15% | |=========== | 16% | |============ | 18% | |============= | 20% | |============== | 22% | |=============== | 24% | |================= | 25% | |================== | 27% | |=================== | 29% | |==================== | 31% | |===================== | 33% | 
|====================== | 35% | |======================== | 36% | |========================= | 38% | |========================== | 40% | |=========================== | 42% | |============================ | 44% | |============================== | 45% | |=============================== | 47% | |================================ | 49% | |================================= | 51% | |================================== | 53% | |=================================== | 55% | |===================================== | 56% | |====================================== | 58% | |======================================= | 60% | |======================================== | 62% | |========================================= | 64% | |=========================================== | 65% | |============================================ | 67% | |============================================= | 69% | |============================================== | 71% | |=============================================== | 73% | |================================================ | 75% | |================================================== | 76% | |=================================================== | 78% | |==================================================== | 80% | |===================================================== | 82% | |====================================================== | 84% | |======================================================== | 85% | |========================================================= | 87% | |========================================================== | 89% | |=========================================================== | 91% | |============================================================ | 93% | |============================================================= | 95% | |=============================================================== | 96% | |================================================================ | 98% | |=================================================================| 100% ## 
Calculating gene attributes ## Wall clock passed: Time difference of 28.12818 secs ## Pearson residuals, or deviance residuals sctnorm_data$model_str ## [1] "y ~ log10_total_counts_endogenous + batch" assay(umi.qc, "sctrans_norm") <- sctnorm_data$y Let us look at the NB GLM model parameters estimated by sctransform. #sce$log10_total_counts ## Matrix of estimated model parameters per gene (theta and regression coefficients) sctransform::plot_model_pars(sctnorm_data) Do these parameters and the regularization look sensible to you? Any concerns? reducedDim(umi.qc, "PCA_sctrans_norm") <- reducedDim( runPCA(umi.qc[endog_genes, ], exprs_values = "sctrans_norm") ) plotReducedDim( umi.qc, use_dimred = "PCA_sctrans_norm", colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle("PCA plot: sctransform normalization") Figure 7.7: PCA plot of the tung data after sctransform normalisation (Pearson residuals). Q: What’s happened here? Was that expected? Any other comments? 7.6.4 Remove Unwanted Variation Factors contributing to technical noise frequently appear as “batch effects”, where cells processed on different days or by different technicians systematically vary from one another. Removing technical noise and correcting for batch effects can frequently be performed using the same tool or slight variants on it. We will be considering the Remove Unwanted Variation approach (implemented in the RUVSeq package). Briefly, RUVSeq works as follows. 
For \(n\) samples and \(J\) genes, consider the following generalized linear model (GLM), where the RNA-Seq read counts are regressed on both the known covariates of interest and unknown factors of unwanted variation: \[\log E[Y|W,X,O] = W\alpha + X\beta + O\] Here, \(Y\) is the \(n \times J\) matrix of observed gene-level read counts, \(X\) is the matrix of known covariates of interest (with associated coefficients \(\beta\)), \(W\) is an \(n \times k\) matrix corresponding to the factors of “unwanted variation” (with associated coefficients \(\alpha\)) and \(O\) is an \(n \times J\) matrix of offsets that can either be set to zero or estimated with some other normalization procedure (such as upper-quartile normalization). The simultaneous estimation of \(W\), \(\alpha\), \(\beta\), and \(k\) is infeasible. Instead, for a given \(k\), one of the following three approaches is used to estimate the factors of unwanted variation \(W\): RUVg uses negative control genes (e.g. ERCCs), assumed to have constant expression across samples; RUVs uses centered (technical) replicate/negative control samples for which the covariates of interest are constant; RUVr uses residuals, e.g., from a first-pass GLM regression of the counts on the covariates of interest. We will concentrate on the first two approaches. 7.6.4.1 RUVg To use RUVg we will use ERCCs as negative control genes to anchor the estimation of factors representing unwanted variation. RUVg operates on the raw count data. We adjust the output normalized counts from RUVg so that they represent normalized counts-per-million and then apply a log2 transformation. We run RUVg twice, with \(k=1\) and \(k=10\) so that we can compare the effect of estimating different numbers of hidden factors to capture unwanted variation in the data. 
ruvg <- RUVg(counts(umi.qc), erccs, k = 1) assay(umi.qc, "ruvg1") <- log2( t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1 ) ruvg <- RUVg(counts(umi.qc), erccs, k = 10) assay(umi.qc, "ruvg10") <- log2( t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1 ) When we assess the effectiveness of various batch correction methods below, you can discuss whether or not you think using ERCCs as negative control genes for a method like RUVg is advisable (in this dataset and in general). 7.6.4.2 RUVs In this application of RUVs we treat the individuals as replicates for which the covariates of interest are constant. As above, we adjust the output normalized counts from RUVs so that they represent normalized counts-per-million and then apply a log2 transformation. Again, we run the method with \\(k=1\\) and \\(k=10\\) so that we can compare the effect of estimating different number of hidden factors. scIdx <- matrix(-1, ncol = max(table(umi.qc$individual)), nrow = 3) tmp <- which(umi.qc$individual == "NA19098") scIdx[1, 1:length(tmp)] <- tmp tmp <- which(umi.qc$individual == "NA19101") scIdx[2, 1:length(tmp)] <- tmp tmp <- which(umi.qc$individual == "NA19239") scIdx[3, 1:length(tmp)] <- tmp cIdx <- rownames(umi.qc) ruvs <- RUVs(counts(umi.qc), cIdx, k = 1, scIdx = scIdx, isLog = FALSE) assay(umi.qc, "ruvs1") <- log2( t(t(ruvs$normalizedCounts) / colSums(ruvs$normalizedCounts) * 1e6) + 1 ) ruvs <- RUVs(counts(umi.qc), cIdx, k = 10, scIdx = scIdx, isLog = FALSE) assay(umi.qc, "ruvs10") <- log2( t(t(ruvs$normalizedCounts) / colSums(ruvs$normalizedCounts) * 1e6) + 1 ) 7.6.5 Combat If you have an experiment with a balanced design, Combat can be used to eliminate batch effects while preserving biological effects by specifying the biological effects using the mod parameter. However the Tung data contains multiple experimental replicates rather than a balanced design so using mod1 to preserve biological variability will result in an error. 
combat_data <- logcounts(umi.qc) mod_data <- as.data.frame(t(combat_data)) # Basic batch removal mod0 <- model.matrix(~ 1, data = mod_data) # Preserve biological variability mod1 <- model.matrix(~ umi.qc$individual, data = mod_data) # adjust for total genes detected mod2 <- model.matrix(~ umi.qc$total_features_by_counts, data = mod_data) assay(umi.qc, "combat") <- ComBat( dat = t(mod_data), batch = factor(umi.qc$batch), mod = mod0, par.prior = TRUE, prior.plots = FALSE ) ## Standardizing Data across genes Exercise 1 Perform ComBat correction accounting for total features as a co-variate. Store the corrected matrix in the combat_tf slot. 7.6.6 mnnCorrect mnnCorrect (Haghverdi et al. 2017) assumes that each batch shares at least one biological condition with each other batch. Thus it works well for a variety of balanced experimental designs. However, the Tung data contains multiple replicates for each individual rather than balanced batches, thus we will normalize each individual separately. Note that this will remove batch effects between batches within the same individual but not the batch effects between batches in different individuals, due to the confounded experimental design. Thus we will merge a replicate from each individual to form three batches. 
The fastMNN() function first performs a principal components analysis (PCA). MNN identification and correction is then performed on this low-dimensional representation of the data, an approach that offers some advantages in speed and denoising.
indi1 <- batchelor::fastMNN( umi.qc[, umi.qc$individual == "NA19098"], batch = umi.qc[, umi.qc$individual == "NA19098"]$replicate) ## Warning in sweep(centered, 2, w, "/", check.margin = FALSE): 'check.margin' is ignored when 'x' is a DelayedArray object or ## derivative indi2 <- batchelor::fastMNN( umi.qc[, umi.qc$individual == "NA19101"], batch = umi.qc[, umi.qc$individual == "NA19101"]$replicate) ## Warning in sweep(centered, 2, w, "/", check.margin = FALSE): 'check.margin' is ignored when 'x' is a DelayedArray object or ## derivative indi3 <- batchelor::fastMNN( umi.qc[, umi.qc$individual == "NA19239"], batch = umi.qc[, umi.qc$individual == "NA19239"]$replicate) ## Warning in sweep(centered, 2, w, "/", check.margin = FALSE): 'check.margin' is ignored when 'x' is a DelayedArray object or ## derivative identical(colnames(umi.qc), colnames(cbind(assay(indi1, "reconstructed"), assay(indi2, "reconstructed"), assay(indi3, "reconstructed")))) ## [1] TRUE fastmnn <- cbind(assay(indi1, "reconstructed"), assay(indi2, "reconstructed"), assay(indi3, "reconstructed")) identical(rownames(umi.qc), rownames(fastmnn)) ## [1] FALSE ## fastMNN() drops 66 genes, so we cannot immediately add the reconstructed expression matrix to assays() in umi.qc ## But we can run PCA on the reconstructed data from fastMNN() and add that to the reducedDim slot of our SCE object fastmnn_pca <- runPCA(fastmnn, rank=2) reducedDim(umi.qc, "fastmnn") <- fastmnn_pca$rotation For further details, please consult the batchelor package documentation and vignette. 7.6.7 Harmony Harmony [Korsunsky2018fast] is a newer batch correction method, which is designed to operate on PC space. The algorithm proceeds to iteratively cluster the cells, with the objective function formulated to promote cells from multiple datasets within each cluster. Once a clustering is obtained, the positions of the centroids of each dataset are obtained on a per-cluster basis and the coordinates are corrected. 
The main reason comparisons are challenging is that it is often difficult to know what corresponds to technical confounders and what is interesting biological variability.
We evaluate the effectiveness of the normalization by inspecting the PCA plot where colour corresponds to the technical replicates and shape corresponds to different biological samples (individuals).
2017) takes kNN networks around random cells and tests the number of cells from each batch against a binomial distribution. The rejection rate of these tests indicates the severity of batch-effects still present in the data (high rejection rate = strong batch effects). kBET assumes each batch contains the same complement of biological groups, thus it can only be applied to the entire dataset if a perfectly balanced design has been used. However, kBET can also be applied to replicate-data if it is applied to each biological group separately. In the case of the Tung data, we will apply kBET to each individual independently to check for residual batch effects. However, this method will not identify residual batch-effects which are confounded with biological conditions. In addition, kBET does not determine if biological signal has been preserved. compare_kBET_results <- function(sce){ indiv <- unique(sce$individual) norms <- assayNames(sce) # Get all normalizations results <- list() for (i in indiv){ for (j in norms){ tmp <- kBET( df = t(assay(sce[,sce$individual== i], j)), batch = sce$batch[sce$individual==i], heuristic = TRUE, verbose = FALSE, addTest = FALSE, plot = FALSE) results[[i]][[j]] <- tmp$summary$kBET.observed[1] } } return(as.data.frame(results)) } eff_debatching <- compare_kBET_results(umi.qc) require("reshape2") require("RColorBrewer") # Plot results dod <- melt(as.matrix(eff_debatching), value.name = "kBET") colnames(dod)[1:2] <- c("Normalisation", "Individual") colorset <- c('gray', brewer.pal(n = 9, "Oranges")) ggplot(dod, aes(Normalisation, Individual, fill=kBET)) + geom_tile() + scale_fill_gradient2( na.value = "gray", low = colorset[2], mid=colorset[6], high = colorset[10], midpoint = 0.5, limit = c(0,1)) + scale_x_discrete(expand = c(0, 0)) + scale_y_discrete(expand = c(0, 0)) + theme( axis.text.x = element_text( angle = 45, vjust = 1, size = 12, hjust = 1 ) ) + ggtitle("Effect of batch regression methods per individual") Exercise 4 Why do the raw 
counts appear to have little batch effects? 7.6.9 Big Exercise Perform the same analysis with read counts of the tung data. Use tung/reads.rds file to load the reads SCE object. Once you have finished please compare your results to ours (next chapter). Additionally, experiment with other combinations of normalizations and compare the results. 7.6.10 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] stats4 parallel stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] RColorBrewer_1.1-2 reshape2_1.4.3 ## [3] harmony_1.0 Rcpp_1.0.2 ## [5] sva_3.32.1 genefilter_1.66.0 ## [7] mgcv_1.8-28 nlme_3.1-139 ## [9] kBET_0.99.6 scran_1.12.1 ## [11] scater_1.12.2 ggplot2_3.2.1 ## [13] SingleCellExperiment_1.6.0 RUVSeq_1.18.0 ## [15] edgeR_3.26.8 limma_3.40.6 ## [17] EDASeq_2.18.0 ShortRead_1.42.0 ## [19] GenomicAlignments_1.20.1 SummarizedExperiment_1.14.1 ## [21] DelayedArray_0.10.0 matrixStats_0.55.0 ## [23] Rsamtools_2.0.1 GenomicRanges_1.36.1 ## [25] GenomeInfoDb_1.20.0 Biostrings_2.52.0 ## [27] XVector_0.24.0 IRanges_2.18.3 ## [29] S4Vectors_0.22.1 BiocParallel_1.18.1 ## [31] Biobase_2.44.0 BiocGenerics_0.30.0 ## [33] scRNA.seq.funcs_0.1.0 ## ## loaded via a namespace (and not attached): ## [1] backports_1.1.4 aroma.light_3.14.0 ## [3] plyr_1.8.4 igraph_1.2.4.1 ## [5] lazyeval_0.2.2 splines_3.6.0 ## [7] listenv_0.7.0 elliptic_1.4-0 ## [9] digest_0.6.21 htmltools_0.3.6 ## [11] viridis_0.5.1 
magrittr_1.5 ## [13] memoise_1.1.0 contfrac_1.1-12 ## [15] cluster_2.1.0 globals_0.12.4 ## [17] annotate_1.62.0 R.utils_2.9.0 ## [19] prettyunits_1.0.2 colorspace_1.4-1 ## [21] blob_1.2.0 xfun_0.9 ## [23] dplyr_0.8.3 crayon_1.3.4 ## [25] RCurl_1.95-4.12 zeallot_0.1.0 ## [27] survival_2.43-3 glue_1.3.1 ## [29] gtable_0.3.0 zlibbioc_1.30.0 ## [31] BiocSingular_1.0.0 future.apply_1.3.0 ## [33] scales_1.0.0 DESeq_1.36.0 ## [35] DBI_1.0.0 viridisLite_0.3.0 ## [37] xtable_1.8-4 progress_1.2.2 ## [39] dqrng_0.2.1 bit_1.1-14 ## [41] rsvd_1.0.2 deSolve_1.24 ## [43] httr_1.4.1 FNN_1.1.3 ## [45] pkgconfig_2.0.3 XML_3.98-1.20 ## [47] R.methodsS3_1.7.1 locfit_1.5-9.1 ## [49] dynamicTreeCut_1.63-1 tidyselect_0.2.5 ## [51] labeling_0.3 rlang_0.4.0 ## [53] AnnotationDbi_1.46.1 munsell_0.5.0 ## [55] tools_3.6.0 moments_0.14 ## [57] RSQLite_2.1.2 batchelor_1.0.1 ## [59] evaluate_0.14 stringr_1.4.0 ## [61] yaml_2.2.0 knitr_1.25 ## [63] bit64_0.9-7 hypergeo_1.2-13 ## [65] purrr_0.3.2 future_1.14.0 ## [67] R.oo_1.22.0 biomaRt_2.40.4 ## [69] compiler_3.6.0 beeswarm_0.2.3 ## [71] tibble_2.1.3 statmod_1.4.32 ## [73] geneplotter_1.62.0 stringi_1.4.3 ## [75] highr_0.8 GenomicFeatures_1.36.4 ## [77] lattice_0.20-38 Matrix_1.2-17 ## [79] vctrs_0.2.0 lifecycle_0.1.0 ## [81] pillar_1.4.2 BiocNeighbors_1.2.0 ## [83] cowplot_1.0.0 bitops_1.0-6 ## [85] orthopolynom_1.0-5 irlba_2.3.3 ## [87] rtracklayer_1.44.4 R6_2.4.0 ## [89] latticeExtra_0.6-28 hwriter_1.3.2 ## [91] bookdown_0.13 gridExtra_2.3 ## [93] vipor_0.4.5 codetools_0.2-16 ## [95] MASS_7.3-51.1 assertthat_0.2.1 ## [97] withr_2.1.2 sctransform_0.2.0 ## [99] GenomeInfoDbData_1.2.1 hms_0.5.1 ## [101] grid_3.6.0 tidyr_1.0.0 ## [103] rmarkdown_1.15 DelayedMatrixStats_1.6.1 ## [105] Rtsne_0.15 ggbeeswarm_0.6.0 7.7 Dealing with confounders (Reads) library(scRNA.seq.funcs) library(RUVSeq) library(scater) library(SingleCellExperiment) library(scran) library(kBET) library(sva) # Combat library(harmony) library(edgeR) set.seed(1234567) 
options(stringsAsFactors = FALSE) reads <- readRDS("data/tung/reads.rds") reads.qc <- reads[rowData(reads)$use, colData(reads)$use] endog_genes <- !rowData(reads.qc)$is_feature_control erccs <- rowData(reads.qc)$is_feature_control qclust <- quickCluster(reads.qc, min.size = 30) reads.qc <- computeSumFactors(reads.qc, sizes = 15, clusters = qclust) reads.qc <- normalize(reads.qc) ruvg <- RUVg(counts(reads.qc), erccs, k = 1) assay(reads.qc, "ruvg1") <- log2( t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1 ) ruvg <- RUVg(counts(reads.qc), erccs, k = 10) assay(reads.qc, "ruvg10") <- log2( t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1 ) scIdx <- matrix(-1, ncol = max(table(reads.qc$individual)), nrow = 3) tmp <- which(reads.qc$individual == "NA19098") scIdx[1, 1:length(tmp)] <- tmp tmp <- which(reads.qc$individual == "NA19101") scIdx[2, 1:length(tmp)] <- tmp tmp <- which(reads.qc$individual == "NA19239") scIdx[3, 1:length(tmp)] <- tmp cIdx <- rownames(reads.qc) ruvs <- RUVs(counts(reads.qc), cIdx, k = 1, scIdx = scIdx, isLog = FALSE) assay(reads.qc, "ruvs1") <- log2( t(t(ruvs$normalizedCounts) / colSums(ruvs$normalizedCounts) * 1e6) + 1 ) ruvs <- RUVs(counts(reads.qc), cIdx, k = 10, scIdx = scIdx, isLog = FALSE) assay(reads.qc, "ruvs10") <- log2( t(t(ruvs$normalizedCounts) / colSums(ruvs$normalizedCounts) * 1e6) + 1 ) combat_data <- logcounts(reads.qc) mod_data <- as.data.frame(t(combat_data)) # Basic batch removal mod0 = model.matrix(~ 1, data = mod_data) # Preserve biological variability mod1 = model.matrix(~ reads.qc$individual, data = mod_data) # adjust for total genes detected mod2 = model.matrix(~ reads.qc$total_features_by_counts, data = mod_data) assay(reads.qc, "combat") <- ComBat( dat = t(mod_data), batch = factor(reads.qc$batch), mod = mod0, par.prior = TRUE, prior.plots = FALSE ) Exercise 1 do_mnn <- function(data.qc) { batch1 <- logcounts(data.qc[, data.qc$replicate == "r1"]) batch2 <- logcounts(data.qc[, 
data.qc$replicate == "r2"]) batch3 <- logcounts(data.qc[, data.qc$replicate == "r3"]) if (ncol(batch2) > 0) { x <- batchelor::mnnCorrect( batch1, batch2, batch3, k = 20, sigma = 0.1, cos.norm.in = TRUE, svd.dim = 2 ) return(x) } else { x <- batchelor::mnnCorrect( batch1, batch3, k = 20, sigma = 0.1, cos.norm.in = TRUE, svd.dim = 2 ) return(x) } } indi1 <- do_mnn(reads.qc[, reads.qc$individual == "NA19098"]) indi2 <- do_mnn(reads.qc[, reads.qc$individual == "NA19101"]) indi3 <- do_mnn(reads.qc[, reads.qc$individual == "NA19239"]) assay(reads.qc, "mnn") <- cbind(indi1, indi2, indi3) # For a balanced design: #assay(reads.qc, "mnn") <- mnnCorrect( # list(B1 = logcounts(batch1), B2 = logcounts(batch2), B3 = logcounts(batch3)), # k = 20, # sigma = 0.1, # cos.norm = TRUE, # svd.dim = 2 #) glm_fun <- function(g, batch, indi) { model <- glm(g ~ batch + indi) model$coef[1] <- 0 # replace intercept with 0 to preserve reference batch. return(model$coef) } effects <- apply( logcounts(reads.qc), 1, glm_fun, batch = reads.qc$batch, indi = reads.qc$individual ) corrected <- logcounts(reads.qc) - t(effects[as.numeric(factor(reads.qc$batch)), ]) assay(reads.qc, "glm") <- corrected Exercise 2 reads.qc.endog = reads.qc[endog_genes,] reads.qc.endog = runPCA(reads.qc.endog, exprs_values = 'logcounts', ncomponents = 20) pca <- as.matrix(reads.qc.endog@reducedDims@listData[["PCA"]]) harmony_emb <- HarmonyMatrix(pca, reads.qc.endog$batch, theta=2, do_pca=FALSE) reads.qc.endog@reducedDims@listData[['harmony']] <- harmony_emb plotReducedDim( reads.qc.endog, use_dimred = 'harmony', colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) for(n in assayNames(reads.qc)) { tmp <- runPCA( reads.qc[endog_genes, ], exprs_values = n ) print( plotPCA( tmp, colour_by = "batch", size_by = "total_features_by_counts", shape_by = "individual" ) + ggtitle(n) ) } res <- list() for(n in assayNames(reads.qc)) { res[[n]] <- suppressWarnings(calc_cell_RLE(assay(reads.qc, n), erccs)) 
} par(mar=c(6,4,1,1)) boxplot(res, las=2) compare_kBET_results <- function(sce){ indiv <- unique(sce$individual) norms <- assayNames(sce) # Get all normalizations results <- list() for (i in indiv){ for (j in norms){ tmp <- kBET( df = t(assay(sce[,sce$individual== i], j)), batch = sce$batch[sce$individual==i], heuristic = TRUE, verbose = FALSE, addTest = FALSE, plot = FALSE) results[[i]][[j]] <- tmp$summary$kBET.observed[1] } } return(as.data.frame(results)) } eff_debatching <- compare_kBET_results(reads.qc) require("reshape2") require("RColorBrewer") # Plot results dod <- melt(as.matrix(eff_debatching), value.name = "kBET") colnames(dod)[1:2] <- c("Normalisation", "Individual") colorset <- c('gray', brewer.pal(n = 9, "RdYlBu")) ggplot(dod, aes(Normalisation, Individual, fill=kBET)) + geom_tile() + scale_fill_gradient2( na.value = "gray", low = colorset[2], mid=colorset[6], high = colorset[10], midpoint = 0.5, limit = c(0,1)) + scale_x_discrete(expand = c(0, 0)) + scale_y_discrete(expand = c(0, 0)) + theme( axis.text.x = element_text( angle = 45, vjust = 1, size = 12, hjust = 1 ) ) + ggtitle("Effect of batch regression methods per individual") 7.8 Feature Selection library(scRNA.seq.funcs) library(matrixStats) library(M3Drop) library(RColorBrewer) library(SingleCellExperiment) library(Polychrome) library(scater) library(scran) set.seed(1) Single-cell RNASeq is capable of measuring the expression of many thousands of genes in every cell. However, in most situations only a portion of those will show a response to the biological condition of interest, e.g. differences in cell-type, drivers of differentiation, respond to an environmental stimulus. Most genes detected in a scRNASeq experiment will only be detected at different levels due to technical noise. One consequence of this is that technical noise and batch effects can obscure the biological signal of interest. 
M3Drop contains two different feature selection methods: "M3DropFeatureSelection", which is based on a Michaelis-Menten curve and is designed for full-transcript single-cell RNA-seq data (such as Smart-seq2), and "NBumiFeatureSelectionCombinedDrop", which is based on a negative binomial model and is designed for UMI count data. We will demonstrate both on the Deng Smart-seq2 data. M3Drop feature selection runs directly on a normalized (but not log-transformed) expression matrix.
(Hint: use log="xy" to plot on a log-scale).
Exercise 3 How many genes were significant using BrenneckeGetVariableGenes?
This model also provides p-values (with FDR adjustment) that can be used to identify the set of "significant" highly variable genes at a given significance level.
The frequency of zeros, known as the "dropout rate", is very closely related to expression level in scRNASeq data. Zeros are the dominant feature of single-cell RNASeq data, typically accounting for over half of the entries in the final expression matrix. These zeros predominantly result from the failure of mRNAs to be reverse transcribed (Andrews and Hemberg, 2016).
This method is designed to model the raw counts in a dataset directly, and we can extract the appropriate matrix using the "NBumiConvertData" function, similar to M3Drop.
Exercise 5 How many genes were significant using NBumiFeatureSelectionCombinedDrop?
deng_sparse <- as(counts(deng), "dgCMatrix") ### Genes expressed in at least 5 cells will be kept sctnorm_data <- sctransform::vst(umi = deng_sparse, min_cells = 1, cell_attr = as.data.frame(colData(deng)), latent_var = "log10_total_counts_endogenous") ## | | | 0% | |======== | 12% | |================ | 25% | |======================== | 38% | |================================ | 50% | |========================================= | 62% | |================================================= | 75% | |========================================================= | 88% | |=================================================================| 100% ## | | | 0% | |= | 1% | |== | 2% | |== | 4% | |=== | 5% | |==== | 6% | |===== | 7% | |===== | 8% | |====== | 10% | |======= | 11% | |======== | 12% | |========= | 13% | |========= | 14% | |========== | 15% | |=========== | 17% | |============ | 18% | |============ | 19% | |============= | 20% | |============== | 21% | |=============== | 23% | |=============== | 24% | |================ | 25% | |================= | 26% | |================== | 27% | |=================== | 29% | |=================== | 30% | |==================== | 31% | |===================== | 32% | |====================== | 33% | |====================== | 35% | |======================= | 36% | |======================== | 37% | |========================= | 38% | |========================== | 39% | |========================== | 40% | |=========================== | 42% | |============================ | 43% | |============================= | 44% | |============================= | 45% | |============================== | 46% | |=============================== | 48% | |================================ | 49% | |================================ | 50% | |================================= | 51% | |================================== | 52% | |=================================== | 54% | |==================================== | 55% | |==================================== | 56% | 
|===================================== | 57% | |====================================== | 58% | |======================================= | 60% | |======================================= | 61% | |======================================== | 62% | |========================================= | 63% | |========================================== | 64% | |=========================================== | 65% | |=========================================== | 67% | |============================================ | 68% | |============================================= | 69% | |============================================== | 70% | |============================================== | 71% | |=============================================== | 73% | |================================================ | 74% | |================================================= | 75% | |================================================== | 76% | |================================================== | 77% | |=================================================== | 79% | |==================================================== | 80% | |===================================================== | 81% | |===================================================== | 82% | |====================================================== | 83% | |======================================================= | 85% | |======================================================== | 86% | |======================================================== | 87% | |========================================================= | 88% | |========================================================== | 89% | |=========================================================== | 90% | |============================================================ | 92% | |============================================================ | 93% | |============================================================= | 94% | |============================================================== | 95% | 
|=============================================================== | 96% | |=============================================================== | 98% | |================================================================ | 99% | |=================================================================| 100% sctnorm_data$model_str ## [1] "y ~ log10_total_counts_endogenous" library(ggplot2) ggplot(sctnorm_data$gene_attr, aes(residual_variance)) + geom_histogram(binwidth=0.1) + geom_vline(xintercept=1, color='red') + xlim(0, 10) sctnorm_data$gene_attr$label <- rownames(sctnorm_data$gene_attr) ggplot(sctnorm_data$gene_attr, aes(x = gmean, y=residual_variance)) + geom_point(alpha = 0.6) + geom_point(colour = "firebrick2", data = sctnorm_data$gene_attr[sctnorm_data$gene_attr$residual_variance > 3,]) + scale_x_log10() + geom_hline(yintercept = 1, size = 3, color = "dodgerblue") + geom_label(aes(label = label), data = sctnorm_data$gene_attr[sctnorm_data$gene_attr$residual_variance > 30,]) + theme_bw() sct_genes <- rownames(sctnorm_data$gene_attr)[sctnorm_data$gene_attr$residual_variance > 4] table(sctnorm_data$gene_attr$residual_variance > 4) ## ## FALSE TRUE ## 20077 1220 If we set a (relatively arbitrary) threshold of a residual variance greater than three marking a “highly variable geneâ€, then we identify around 2000 highly variable genes with this sctransform approach. [NB: the deng data is extremely high depth for scRNA-seq data, so not the most applicable dataset for sctransform, but we include this analysis here to demonstrate the method rather than make any evaluation of its performance in general.] Although not explored here, the deviance statistic from the regularized NB GLM fit provides a natural way to select informative features for downstream analyses. The deviance is a goodness-of-fit statistic for a statistical model. 
As Wikipedia notes, deviance is a generalization of the idea of using the sum of squares of residuals in ordinary least squares to cases where model-fitting is achieved by maximum likelihood. It plays an important role in exponential dispersion models and generalized linear models, such as the negative binomial model. However, sctransform does not seem set up to use the model deviance to select informative features, but we expect this could be a direction the field goes in the near future. Keep an eye out! 7.8.2 Correlated Expression A completely different approach to feature selection is to use gene-gene correlations. This method is based on the idea that multiple genes will be differentially expressed between different cell-types or cell-states. Genes which are expressed in the same cell-population will be positively correlated with each other where as genes expressed in different cell-populations will be negatively correated with each other. Thus important genes can be identified by the magnitude of their correlation with other genes. The limitation of this method is that it assumes technical noise is random and independent for each cell, thus shouldn’t produce gene-gene correlations, but this assumption is violated by batch effects which are generally systematic between different experimental batches and will produce gene-gene correlations. As a result it is more appropriate to take the top few thousand genes as ranked by gene-gene correlation than consider the significance of the correlations. cor_feat <- M3Drop::corFS(expr_matrix) Cor_genes <- names(cor_feat)[1:1500] 7.8.3 Comparing Methods We can check whether the identified features really do represent genes differentially expressed between cell-types in this dataset. 
M3DropExpressionHeatmap( M3Drop_genes, expr_matrix, cell_labels = celltype_labs ) We can also consider how consistent each feature selection method is with the others using the Jaccard Index: J <- sum(M3Drop_genes %in% HVG_genes)/length(unique(c(M3Drop_genes, HVG_genes))) Exercise 6 Plot the expression of the features for each of the other methods. Which appear to be differentially expressed? How consistent are the different methods for this dataset? M3DropExpressionHeatmap( DANB_genes, expr_matrix, cell_labels = celltype_labs ) Jaccard index comparison of sets of informative features: list_of_features <- list( M3Drop_genes, DANB_genes, HVG_genes, simplesinglecell_genes, sct_genes ) Out <- matrix( 0, ncol = length(list_of_features), nrow = length(list_of_features) ) for(i in 1:length(list_of_features) ) { for(j in 1:length(list_of_features) ) { Out[i,j] <- sum(list_of_features[[i]] %in% list_of_features[[j]])/ length(unique(c(list_of_features[[i]], list_of_features[[j]]))) } } colnames(Out) <- rownames(Out) <- c("M3Drop", "DANB", "Brennecke", "simpleSingleCell", "sctransform") Out ## M3Drop DANB Brennecke simpleSingleCell ## M3Drop 1.0000000 0.38019061 0.4152905 0.14615908 ## DANB 0.3801906 1.00000000 0.2283346 0.09868187 ## Brennecke 0.4152905 0.22833459 1.0000000 0.15019157 ## simpleSingleCell 0.1461591 0.09868187 0.1501916 1.00000000 ## sctransform 0.2343257 0.21801471 0.2718985 0.26034913 ## sctransform ## M3Drop 0.2343257 ## DANB 0.2180147 ## Brennecke 0.2718985 ## simpleSingleCell 0.2603491 ## sctransform 1.0000000 7.8.4 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] 
LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 Polychrome_1.2.3 ## [5] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [7] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## [15] RColorBrewer_1.1-2 M3Drop_1.10.0 ## [17] numDeriv_2016.8-1.1 matrixStats_0.55.0 ## [19] scRNA.seq.funcs_0.1.0 ## ## loaded via a namespace (and not attached): ## [1] Rtsne_0.15 ggbeeswarm_0.6.0 ## [3] colorspace_1.4-1 dynamicTreeCut_1.63-1 ## [5] htmlTable_1.13.2 XVector_0.24.0 ## [7] base64enc_0.1-3 BiocNeighbors_1.2.0 ## [9] rstudioapi_0.10 listenv_0.7.0 ## [11] codetools_0.2-16 splines_3.6.0 ## [13] knitr_1.25 Formula_1.2-3 ## [15] cluster_2.1.0 sctransform_0.2.0 ## [17] compiler_3.6.0 dqrng_0.2.1 ## [19] backports_1.1.4 assertthat_0.2.1 ## [21] Matrix_1.2-17 lazyeval_0.2.2 ## [23] limma_3.40.6 BiocSingular_1.0.0 ## [25] acepack_1.4.1 htmltools_0.3.6 ## [27] tools_3.6.0 rsvd_1.0.2 ## [29] igraph_1.2.4.1 gtable_0.3.0 ## [31] glue_1.3.1 GenomeInfoDbData_1.2.1 ## [33] reshape2_1.4.3 dplyr_0.8.3 ## [35] Rcpp_1.0.2 bbmle_1.0.20 ## [37] gdata_2.18.0 nlme_3.1-139 ## [39] DelayedMatrixStats_1.6.1 xfun_0.9 ## [41] stringr_1.4.0 globals_0.12.4 ## [43] irlba_2.3.3 gtools_3.8.1 ## [45] hypergeo_1.2-13 statmod_1.4.32 ## [47] future_1.14.0 edgeR_3.26.8 ## [49] zlibbioc_1.30.0 MASS_7.3-51.1 ## [51] scales_1.0.0 yaml_2.2.0 ## [53] gridExtra_2.3 rpart_4.1-15 ## [55] latticeExtra_0.6-28 stringi_1.4.3 ## [57] checkmate_1.9.4 orthopolynom_1.0-5 ## [59] contfrac_1.1-12 caTools_1.17.1.2 ## [61] rlang_0.4.0 pkgconfig_2.0.3 ## [63] moments_0.14 bitops_1.0-6 ## [65] evaluate_0.14 lattice_0.20-38 ## [67] 
purrr_0.3.2 htmlwidgets_1.3 ## [69] labeling_0.3 cowplot_1.0.0 ## [71] tidyselect_0.2.5 deSolve_1.24 ## [73] plyr_1.8.4 magrittr_1.5 ## [75] bookdown_0.13 R6_2.4.0 ## [77] gplots_3.0.1.1 Hmisc_4.2-0 ## [79] pillar_1.4.2 foreign_0.8-70 ## [81] withr_2.1.2 mgcv_1.8-28 ## [83] survival_2.43-3 scatterplot3d_0.3-41 ## [85] RCurl_1.95-4.12 nnet_7.3-12 ## [87] future.apply_1.3.0 tibble_2.1.3 ## [89] crayon_1.3.4 KernSmooth_2.23-15 ## [91] rmarkdown_1.15 viridis_0.5.1 ## [93] locfit_1.5-9.1 grid_3.6.0 ## [95] data.table_1.12.2 reldist_1.6-6 ## [97] digest_0.6.21 elliptic_1.4-0 ## [99] munsell_0.5.0 beeswarm_0.2.3 ## [101] viridisLite_0.3.0 vipor_0.4.5 References "], +["handling-sparsity.html", "8 Handling sparsity 8.1 Challenge: Handling sparsity in single-cell RNA sequencing 8.2 Status 8.3 Open problems", " 8 Handling sparsity The material below is reproduced from (Laehnemann et al. 2019): Laehnemann,D. et al. (2019) 12 Grand challenges in single-cell data science PeerJ Preprints. link 8.1 Challenge: Handling sparsity in single-cell RNA sequencing A comprehensive characterization of the transcriptional status of individual cells enables us to gain full insight into the interplay of transcripts within single cells. However, scRNA-seq measurements typically suffer from large fractions of observed zeros, where a given gene in a given cell has no unique molecule identifiers or reads mapping to it. These observed zero values can represent either missing data (i.e.~a gene is expressed but not detected by the sequencing technology) or true absence of expression. The proportion of zeros, or degree of sparsity, is thought to be due to imperfect reverse transcription and amplification, and other technical limitations (), and depends on the scRNA-seq platform used, the sequencing depth and the underlying expression level of the gene. 
The term "dropout" is often used to denote observed zero values in scRNA-seq data, but this term conflates zero values attributable to methodological noise and biologically-true zero expression, so we recommend against its use as a catch-all term for observed zeros.
Observed zeros can either represent "biological" zeros, i.e.~those present because the true expression level of a gene in a cell was zero, or they are the result of methodological noise, which can arise when a gene has true non-zero expression in a cell, but no counts are observed due to failures at any point in the complicated process of processing mRNA transcripts in cells into mapped reads.
It is therefore desirable to improve both statistical methods that work on sparse count data directly and approaches for data imputation for scRNA-seq data, whether by refining existing techniques or developing new ones (see also ). We define three broad (and sometimes overlapping) categories of methods that can be used to ``impute’’ scRNA-seq data in the absence of an external reference: __Model-based imputation methods of technical zeros_ use probabilistic models to identify which observed zeros represent technical rather than biological zeros and aim to impute expression levels just for these technical zeros, leaving other observed expression levels untouched; or __Data-smoothing methods_ define sets of ``similar’’ cells (e.g.~cells that are neighbors in a graph or occupy a small region in a latent space) and adjust expression values for each cell based on expression values in similar cells. These methods adjust all expression values, including technical zeros, biological zeros and observed non-zero values. __Data-reconstruction methods_ typically aim to define a latent space representation of the cells. This is often done through matrix factorization (e.g.~principal component analysis) or, increasingly, through machine learning approaches (e.g.~variational autoencoders that exploit deep neural networks to capture non-linear relationships). Although a broad class of methods, both matrix factorization methods and autoencoders (among others) are able to reconstruct'' the observed data matrix from low-rank or simplified representations. The reconstructed data matrix will typically no longer be sparse (with many zeros) and the implicitlyimputed’’ data can be used for downstream applications that cannot handle sparse count data. The first category of methods generally seeks to infer a probabilistic model that captures the data generation mechanism. 
Such generative models can be used to identify, probabilistically, which observed zeros correspond to technical zeros (to be imputed) and which correspond to biological zeros (to be left alone). There are many model-based imputation methods already available that use ideas from clustering (e.g.~k-means), dimension reduction, regression and other techniques to impute technical zeros, oftentimes combining ideas from several of these approaches. These include SAVER , ScImpute , bayNorm , scRecover , and VIPER . Clustering methods that implicitly impute values, such as CIDR and BISCUIT , are closely related to this class of imputation methods. Data-smoothing methods, which adjust all gene expression levels based on expression levels in similar'' cells, have also been proposed to handle imputation problems. We might regard these approaches asdenoising’’ methods. To take a simplified example (), we might imagine that single cells originally refer to points in two-dimensional space, but are likely to describe a one-dimensional curve; projecting data points onto that curve eventually allows imputation of the ``missing’’ values (but all points are adjusted, or smoothed, not just true technical zeros). Prominent data-smoothing approaches to handling sparse counts include: diffusion-based MAGIC k-nearest neighbor-based knn-smooth network diffusion-based netSmooth clustering-based DrImpute locality sensitive imputation in LSImpute A major task in the analysis of high-dimensional single-cell data is to find low-dimensional representations of the data that capture the salient biological signals and render the data more interpretable and amenable to further analyses. As it happens, the matrix factorization and latent-space learning methods used for that task also provide another route for imputation through their ability to the observed data matrix from simplified representations of it. 
is one such standard matrix factorization method that can be applied to scRNA-seq data (preferably after suitable data normalization) as are other widely-used general statistical methods like and . As (linear) matrix factorization methods, , and decompose the observed data matrix into a ``small’’ number of factors in two low-rank matrices, one representing cell-by-factor weights and one gene-by-factor loadings. Many matrix factorization methods with tweaks for single-cell data have been proposed in recent years, including: ZIFA, a zero-inflated factor analysis f-scLVM, a sparse Bayesian latent variable model GPLVM, a Gaussian process latent variable model ZINB-WaVE, a zero-inflated negative binomial factor model scCoGAPS, an extension of consensus , a meta-analysis approach to pCMF, probabilistic count matrix factorization with a Poisson model SDA, sparse decomposition of arrays; another sparse Bayesian method . Some data reconstruction approaches have been specifically proposed for imputation, including: ENHANCE, denoising with an aggregation step ALRA, SVD with adaptive thresholding scRMD, robust matrix decomposition Recently, machine learning methods have emerged that apply autoencoders and deep neural networks ) or ensemble learning ) to impute expression values. Additionally, many deep learning methods have been proposed for single-cell data analysis that can, but need not, use probabilistic data generative processes to capture low-dimensional or latent space representations of a dataset. Even if imputation is not a main focus, such methods can generate ``imputed’’ expression values as an upshot of a model primarily focused on other tasks like learning latent spaces, clustering, batch correction, or visualization (and often several of these tasks simultaneously). 
The latter set includes tools such as: DCA, an autoencoder with a zero-inflated negative binomial distribution scVI, a variational autoencoder with a zero-inflated negative binomial model LATE VASC compscVAE scScope Tybalt SAUCIE scvis net-SNE BERMUDA, focused on batch correction DUSC Expression Saliency others Besides the three categories described above, a small number of scRNA-seq imputation methods have been developed to incorporate information external to the current dataset for imputation. These include: ADImpute , which uses gene regulatory network information from external sources; SAVER-X , a transfer learning method for denoising and imputation that can use information from atlas-type resources; and methods that borrow information from matched bulk RNA-seq data like URSM and SCRABBLE . 8.3 Open problems A major challenge in this context is the circularity that arises when imputation solely relies on information that is internal to the imputed dataset. This circularity can artificially amplify the signal contained in the data, leading to inflated correlations between genes and/or cells. In turn, this can introduce false positives in downstream analyses such as differential expression testing and gene network inference . Handling batch effects and potential confounders requires further work to ensure that imputation methods do not mistake unwanted variation from technical sources for biological signal. In a similar vein, single-cell experiments are affected by various uncertainties (see ). Approaches that allow quantification and propagation of the uncertainties associated with expression measurements (), may help to avoid problems associated with ‘overimputation’ and the introduction of spurious signals noted by . To avoid this circularity, it is important to identify reliable external sources of information that can inform the imputation process. One possibility is to exploit external reference panels (like in the context of genetic association studies). 
Such panels are not generally available for scRNA-seq data, but ongoing efforts to develop large scale cell atlases could provide a valuable resource for this purpose. Systematic integration of known biological network structures is desirable and may also help to avoid circularity. A possible approach is to encode network structure knowledge as prior information, as attempted in netSmooth and ADImpute. Another alternative solution is to explore complementary types of data that can inform scRNA-seq imputation. This idea was adopted in SCRABBLE and URSM, where an external reference is defined by bulk expression measurements from the same population of cells for which imputation is performed. Yet another possibility could be to incorporate orthogonal information provided by different types of molecular measurements (see ). Methods designed to integrate multi-omics data could then be extended to enable scRNA-seq imputation, e.g.~through generative models that explicitly link scRNA-seq with other data types or by inferring a shared low-dimensional latent structure that could be used within a data-reconstruction framework. With the proliferation of alternative methods, comprehensive benchmarking is urgently required as for all areas of single-cell data analysis . Early attempts by and provide valuable insights into the performance of methods available at the time. But many more methods have since been proposed and even more comprehensive benchmarking platforms are needed. Many methods, especially those using deep learning, depend strongly on choice of hyperparameters . There, more detailed comparisons that explore parameter spaces would be helpful, extending work like that from comparing dimensionality reduction methods. 
Learning from exemplary benchmarking studies , it would be immensely beneficial to develop a community-supported benchmarking platform with a wide-range of synthetic and experiment ground-truth datasets (or as close as possible, in the case of experimental data) and a variety of thoughtful metrics for evaluating performance. Ideally, such a benchmarking platform would remain dynamic beyond an initial publication to allow ongoing comparison of methods as new approaches are proposed. Detailed benchmarking would also help to establish when normalization methods derived from explicit count models may be preferable to imputation. Finally, scalability for large numbers of cells remains an ongoing concern for imputation, data smoothing and data reconstruction methods, as for all high-throughput single-cell methods and software (see ). library(scater) ## Loading required package: SingleCellExperiment ## Loading required package: SummarizedExperiment ## Loading required package: GenomicRanges ## Loading required package: stats4 ## Loading required package: BiocGenerics ## Loading required package: parallel ## ## Attaching package: 'BiocGenerics' ## The following objects are masked from 'package:parallel': ## ## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, ## clusterExport, clusterMap, parApply, parCapply, parLapply, ## parLapplyLB, parRapply, parSapply, parSapplyLB ## The following objects are masked from 'package:stats': ## ## IQR, mad, sd, var, xtabs ## The following objects are masked from 'package:base': ## ## anyDuplicated, append, as.data.frame, basename, cbind, ## colnames, dirname, do.call, duplicated, eval, evalq, Filter, ## Find, get, grep, grepl, intersect, is.unsorted, lapply, Map, ## mapply, match, mget, order, paste, pmax, pmax.int, pmin, ## pmin.int, Position, rank, rbind, Reduce, rownames, sapply, ## setdiff, sort, table, tapply, union, unique, unsplit, which, ## which.max, which.min ## Loading required package: S4Vectors ## ## Attaching package: 
'S4Vectors' ## The following object is masked from 'package:base': ## ## expand.grid ## Loading required package: IRanges ## Loading required package: GenomeInfoDb ## Loading required package: Biobase ## Welcome to Bioconductor ## ## Vignettes contain introductory material; view with ## 'browseVignettes()'. To cite Bioconductor, see ## 'citation("Biobase")', and for packages 'citation("pkgname")'. ## Loading required package: DelayedArray ## Loading required package: matrixStats ## ## Attaching package: 'matrixStats' ## The following objects are masked from 'package:Biobase': ## ## anyMissing, rowMedians ## Loading required package: BiocParallel ## ## Attaching package: 'DelayedArray' ## The following objects are masked from 'package:matrixStats': ## ## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges ## The following objects are masked from 'package:base': ## ## aperm, apply, rowsum ## Loading required package: ggplot2 ## ## Attaching package: 'scater' ## The following object is masked from 'package:S4Vectors': ## ## rename ## The following object is masked from 'package:stats': ## ## filter library(SingleCellExperiment) library(glmpca) library(ggplot2) library(Polychrome) library(slalom) References "], +["latent-spaces.html", "9 Latent spaces 9.1 Dimensionality reduction 9.2 Matrix factorization and factor analysis 9.3 Autoencoders", " 9 Latent spaces In many cases we may like to think of cells sitting in a low-dimensional, “latent†space that captures relationships between cells more intuitively than the very high-dimensional gene expression space. 9.1 Dimensionality reduction Why? - Reduce Curse of Dimensionality problems - Increase storage and computational efficiency - Visualize Data in 2D or 3D Difficulty: Need to decide how many dimension to keep. 9.1.1 PCA: Principal component analysis 9.1.1.1 (traditional) PCA PCA is a linear feature extraction technique. 
It performs a linear mapping of the data to a lower-dimensional space in such a way that the variance of the data in the low-dimensional representation is maximized. It does so by calculating the eigenvectors from the covariance matrix. The eigenvectors that correspond to the largest eigenvalues (the principal components) are used to reconstruct a significant fraction of the variance of the original data. In simpler terms, PCA combines your input features in a specific way that you can drop the least important feature while still retaining the most valuable parts of all of the features. As an added benefit, each of the new features or components created after PCA are all independent of one another. 9.1.1.1.1 Basic ideas of PCA Idea1: Dropping dimensions = Projection onto lower dimensional space   Which dimension should we keep? Idea2: more variantion = more information   But what if the plot is not readily to be projected onto either X or Y-axis? 9.1.1.1.2 Steps of PCA Step1: Rotation We want a set of axises (called Principle Components) that satisfies: -The 1st axis points to the direction where variantion is maximized, and so on -They are orthogonal to each other It can be shown that the eigen vectors of the covariance matrix satisfy these conditions, and the eigen vector according to the largest eigen value accounts for the most variation. Step2: Projection (3-dimesion \\(\\rightarrow\\) 2-dimension) 9.1.1.1.3 An example of PCA deng <- readRDS("data/deng/deng-reads.rds") my_color1 <- createPalette(6, c("#010101", "#ff0000"), M=1000) names(my_color1) <- unique(as.character(deng$cell_type1)) my_color2 <- createPalette(10, c("#010101", "#ff0000"), M=1000) names(my_color2) <- unique(as.character(deng$cell_type2)) deng <- runPCA(deng, ncomponents = 2) plotPCA(deng, colour_by = "cell_type1") + scale_fill_manual(values = my_color1) ## Scale for 'fill' is already present. Adding another scale for 'fill', ## which will replace the existing scale. 
plotPCA(deng, colour_by = "cell_type2") + scale_fill_manual(values = my_color2) ## Scale for 'fill' is already present. Adding another scale for 'fill', ## which will replace the existing scale. 9.1.1.1.4 Advantages and limits of PCA: Advantages: fast, easy to use and intuitive. Limits: Can lead to local inconsistency, i.e. far away points can become nearest neighbours. It is a linear projection, like casting a shadow, meaning it can’t capture non-linear dependencies. For instance, PCA would not be able to “unroll†the following structure. 9.1.1.2 GLM-PCA (Collins, Dasgupta, and Schapire 2002) (Townes et al. 2019) GLM-PCA is a generalized version of the traditional PCA. The traditional PCA implicitly imposes an assumption of Gaussian distribution. The purpose of GLM-PCA is to loosen this condition to accommodate other distributions of the exponential family. Why does PCA assume a Gaussian distribution? Let \\(x_1, \\dots, x_n \\in \\mathcal{R}^d\\) be the \\(d\\)-dimensional data observed. PCA is looking for their projections onto a subspace: \\(u_1, \\dots, u_n\\), such that \\(\\sum_{i = 1}^n \\Vert x_i - u_i\\Vert^2\\) is minimized. This objective function can be interpretated in two ways: Interpretation 1: the variance of the projections/principal components: \\(\\sum_{i} \\Vert u_i \\Vert ^2\\), if the data is centered at the origin (\\(\\sum_{i} x_i = 0\\)); Interpretation 2: Each point \\(x_i\\) is thought of as a random draw from a probability distribution centered at \\(u_i\\). If we take this probability as a unit Gaussian, that is \\(x_i \\sim N(u_i, 1)\\), then the likelihood is \\(\\prod_{i = 1}^n \\exp (- \\Vert x_i - u_i\\Vert^2)\\), and the negative log likelihood is exactly the objective function. This assumption is often inappropriate for non-Gaussian distributed data, for example discrete data. 
Therefore, GLM-PCA generalizes the Gaussian likelihood into a likelihood of any exponential-family distribution, and applies appropriate link functions to \\(u_i\\)’s in the same way as a GLM does to non-Gaussian responses. The following example compares GLM-PCA with Poisson marginals to the traditional PCA, which is identical to the result from plotPCA. ## GLM-PCA Y <- assay(deng, "counts") Y <- Y[rowSums(Y) > 0, ] system.time(res1 <- glmpca(Y, L=2, fam="poi", verbose=TRUE)) ## user system elapsed ## 94.261 25.207 119.499 pd1 <- data.frame(res1$factors, dimreduce="glmpca-poisson", clust = factor(deng$cell_type2)) ## traditional PCA pd2 <- data.frame(reducedDim(deng, "PCA"), dimreduce="runPCA", clust = factor(deng$cell_type2)) colnames(pd2) <- colnames(pd1) ## plot pd <- rbind(pd1, pd2) ggplot(pd, aes(x = dim1, y = dim2, colour = clust)) + geom_point(size=2) + facet_wrap(~dimreduce, scales="free", nrow=3) + scale_color_manual(values = my_color2) + theme_bw() Let us compare GLM-PCA and standard PCA (using normalized log-counts data) on the Tung data, before cells have been QC’d. Repeat these plots with the QC’d Tung data. 9.1.2 tSNE: t-Distributed Stochastic Neighbor Embedding t-SNE (Maaten and Hinton 2008) is an advanced version of the original SNE algorithm. (Hinton and Roweis 2003) 9.1.2.1 Motivation The weakness of PCA is the motivation behind the SNE algorithm. PCA focuses on global covariance structure, which leads to local inconsistency. SNE aims to preserve local structure, or preserving the relationships among data points (i.e. similar points remain similar; distinct points remain distinct). Unlike PCA, SNE is not limited to linear projections, which makes it suited to all sorts of datasets, including the swiss-roll data we have seen above. t-SNE solves the crowding issue of the original SNE. 
9.1.2.2 Original SNE SNE minimizes the divergence between two distributions: a distribution that measures pairwise similarities of the input objects and a distribution that measures pairwise similarities of the corresponding low-dimensional points in the embedding. Goal: preserve neighbourhoods. Soft neighbourhood: For each data point \\(x_i\\), the \\(i\\rightarrow j\\) probability is the probability that point \\(x_i\\) chooses \\(x_j\\) as its neighbour: \\(p_{j|i} \\propto \\exp(-\\Vert x_i - x_j \\Vert^2/2\\delta^2)\\). (This can be thought of as the probability of \\(x_j\\) in \\(N(x_i, \\delta)\\)) \\(\\Vert x_i - x_j \\Vert^2\\) is the Euclidean distance: The closer \\(x_i\\) and \\(x_j\\) are, the larger \\(p_{j|i}\\) is. \\(\\delta^2\\) denotes the variance; it sets the size of the neighbourhood.  Very low \\(\\Rightarrow\\) all the probability is in the nearest neighbour  Very high \\(\\Rightarrow\\) uniform weights We generally want \\(\\delta^2\\) to be small for points in densely populated areas and large for sparse areas, so that the number of neighbours of all data points is roughly the same. It is computed with a user-specified parameter (perplexity) which indicates the effective number of neighbours for a data point. Similarity matrix Collect \\(p_{j|i}\\) for all data points into a matrix, then this matrix preserves the key information of the local neighbourhood structure. How SNE works: Given high-dimensional data \\(X = \\{x_1, \\dots, x_n \\in \\mathcal{R}^d \\}\\), obtain the similarity matrix \\(P\\); Let \\(Y =\\{y_1, \\dots, y_n \\in \\mathcal{R}^2\\}\\) be 2-dimensional data, the coordinates for visualization. Obtain a similarity matrix of \\(Y\\), denoted as \\(Q\\), in the same way as \\(X\\), except that \\(\\delta^2\\) is fixed at 1/2. Look for \\(Y\\) such that \\(Q\\) is as similar to \\(P\\) as possible. 
Measurement of how similar two distributions are: Kullback-Leibler divergence (The definition of this cost function and the optimization procedure are outside the scope of this course) 9.1.2.3 t-SNE The motivation of t-SNE is to solve one of the main issues of SNE, the crowding problem. Crowding problem: In high dimensions we have more room: points can have a lot of different neighbours that are far apart from each other. But in low dimensions, we don’t have enough room to accommodate all neighbours. For example, in 2D a point can have a few neighbors at distance one all far from each other - what happens when we embed in 1D? Solution: Change the distribution of the low-dimensional data \\(Q\\) into a Student-t distribution. Recall that SNE is trying to minimize the dissimilarity of \\(P\\) and \\(Q\\), and \\(P\\) has a Gaussian distribution. So for a pair of points (\\(x_i\\) and \\(x_j\\) in high-dimension, \\(y_i\\) and \\(y_j\\) in low-dimension) to reach the same probability, the distance between \\(y_i\\) and \\(y_j\\) would be much larger (i.e. much farther apart). 9.1.2.4 Example of t-SNE: muraro <- readRDS("data/pancreas/muraro.rds") tmp <- runTSNE(muraro, perplexity = 3) plotTSNE(tmp, colour_by = "cell_type1") tmp <- runTSNE(muraro, perplexity = 50) plotTSNE(tmp, colour_by = "cell_type1") 9.1.2.5 Limits of t-SNE: Not a convex problem, i.e. the cost function has multiple local minima. Non-deterministic. Requires specification of very important parameters, e.g. perplexity. Coordinates after embedding have no meaning. Therefore can merely be used for visualization. (See here for more pitfalls of using t-SNE.) 
9.1.3 Manifold methods 9.1.3.1 UMAP: Uniform Manifold Approximation and Projection (McInnes, Healy, and Melville 2018) 9.1.3.1.1 Advantages of UMAP over t-SNE: faster deterministic better at preserving clusters 9.1.3.1.2 High level description Construct a topological representation of the high-dimensional data (in this case a weighted \\(k\\)-NN graph) Given low-dimensional data, construct a graph in a similar way Minimize the dissimilarity between the two graphs. (Look for the low-dimensional data whose graph is the closest to that of the high-dimensional data) 9.1.3.1.3 Some details How is the weighted graph built? Obtain dissimilarity from the input distance: For each data point \\(x_i\\), find its \\(k\\) nearest neighbours: \\(x_{i_1}, \\dots, x_{i_k}\\). Let \\(d(x_i, x_{i_j})\\) be the input or original distance between \\(x_i\\) and \\(x_{i_j}\\), and \\(\\rho_i = \\min[d(x_i, x_{i_j}); 1 \\leq j \\leq k]\\) be the distance between \\(x_i\\) and its nearest neighbour. Then the dissimilarity between \\(x_i\\) and \\(x_{i_j}\\) is measured simply by subtracting \\(\\rho_i\\) from the original distance: \\(\\tilde{d}(x_i, x_{i_j}) = d(x_i, x_{i_j}) - \\rho_i\\). Transform dissimilarity to similarity: \\(s(x_i, x_{i_j}) = \\exp[-\\tilde{d}(x_i, x_{i_j})] - c_i\\), where \\(c_i\\) is a scale factor to ensure \\(\\sum_{j = 1}^k s(x_i, x_{i_j})\\) is a constant for all \\(i\\). Similarity itself can serve as edge weights, but this similarity is not symmetrical, i.e. \\(s(x_i, x_{i_j}) \\neq s(x_{i_j}, x_i)\\). To be able to project this onto an undirected graph, we need to solve the disagreement between \\(s(x_i, x_{i_j})\\) and \\(s(x_{i_j}, x_i)\\). Obtain weights: \\(w(x_i, x_{i_j}) = s(x_i, x_{i_j}) + s(x_{i_j}, x_i) - s(x_i, x_{i_j}) * s(x_{i_j}, x_i)\\) (Interpretation: \\(P(A \\cup B ) = P(A) + P(B) - P(A)P(B)\\) if \\(A\\) and \\(B\\) are independent) How is the dissimilarity between graphs measured? 
Cross entropy 9.1.3.1.4 Example of UMAP muraro <- runUMAP(muraro) plotUMAP(muraro, colour_by="cell_type1") 9.1.3.2 PHATE (Moon et al. 2017) 9.1.3.2.1 Sketch of algorithm The simplest description of PHATE: Step1. Create a dissimilarity matrix of the original data Step2. Feed the dissimilarity matrix to nonmetric MDS (MDS: Multi-Dimensional Scaling is a classical dimensionality reduction approach, that takes a distance matrix as input, and aims at preserving pairwise distances in the low dimensional space. When the input distance matrix is Euclidean distance, MDS produces the same result as PCA. Nonmetric MDS generalizes the input to a dissimilarity matrix, rather than just distance.) Details of step1 in PHATE Step1-1. Markov transition matrix - What is similar to SNE: Recall that in the original SNE algorithm, there is a similarity matrix with entry \\(p_{j|i}\\) that is interpreted as the probability that point \\(x_i\\) chooses \\(x_j\\) as its neighbour: \\(p_{j|i} \\propto \\exp(-\\Vert x_i - x_j \\Vert^2/2\\delta^2)\\). PHATE is doing the same, except that we can interpret it differently: i. We can think of \\(p_{j|i}\\) as a Gaussian kernel, where \\(\\epsilon \\triangleq 2\\delta^2\\) is the bandwidth: \\(p_{j|i} \\triangleq K_\\epsilon(x_i, x_j )\\). Similar to SNE, PHATE also defines \\(\\epsilon\\) as the \\(k\\)-NN distance of each data point, so that it is smaller in dense areas and larger in sparse areas. The \\(k\\) is a user-specified tuning parameter, similar to perplexity in SNE. ii. We can think of the similarity matrix as a transition matrix, where \\(p_{j|i}\\) represents the probability of jumping from state \\(i\\) to state \\(j\\) in a single step. - What is different: i. PHATE generalizes \\(K_\\epsilon(x_i, x_j)\\) to \\(\\exp \\left(- \\Vert x_i - x_j \\Vert^\\alpha /\\epsilon(x_i)^\\alpha\\right)\\), where the original Gaussian kernel is the special case when \\(\\alpha = 2\\). 
The motivation is that if the data is very sparse in some regions, then the bandwidth \\(\\epsilon\\) will be very large and the kernel will become flat and lose the local information. By letting \\(\\alpha > 2\\), we prevent this from happening, although \\(\\alpha\\) needs to be provided by the user.  ii. Note that the kernels are not symmetrical now, that is \\(K_\\epsilon(x_i, x_j) \\neq K_\\epsilon(x_j, x_i)\\). So we make it symmetrical by taking an average of the two.  Step1-2. Smoothing - \\(P\\) is the transition matrix where \\(p_{i, j}\\) represents the probability of jumping from state \\(i\\) to state \\(j\\) in a single step. - Denote \\(\\delta_x\\) as a row vector of length \\(n\\) (the number of data points), where only the entry corresponding to \\(x\\) is 1 and zero everywhere else. Then \\(p_x = \\delta_x P\\) is the probability distribution of the data points starting from \\(x\\) after one step, and \\(p_x^t = \\delta_x P^t\\) is the probability distribution of the data points after \\(t\\) steps. In general, the more steps we take, the more data points will have positive probabilities. One way to think about this is: the larger \\(t\\) is, the more global information and the less local information is encoded. In the extreme case, if we take infinitely many steps, \\(p_x^\\infty\\) will be the same for all \\(x\\)’s, i.e. the probability distribution is going to be the same regardless of where we start; in this case, the local information is completely lost. An appropriately chosen \\(t\\) is crucial for the balance between local and global information in the embedding. (See the original paper for details of choosing \\(t\\)) Step1-3. 
Distance measurement Instead of directly measuring the Euclidean distance between data points, say \\(x_i\\) and \\(x_j\\), PHATE measures the distance between probability distributions \\(p_{x_i}^t\\) and \\(p_{x_j}^t\\): \\(D^t(x_i, x_j) = \\Vert \\log(p_{x_i}^t) - \\log(p_{x_j}^t) \\Vert^2\\) 9.1.3.2.2 Example of PHATE library(phateR) deng_phate <- phate(t(assay(deng, "logcounts"))) dt <- data.frame(deng_phate$embedding, clust = deng$cell_type1) palette(rainbow(10)) ggplot(dt, aes(x=PHATE1, y=PHATE2, color=clust)) + geom_point() 9.2 Matrix factorization and factor analysis The key concept of factor analysis: The original, observed variables are correlated because they are all associated with some unobservable variables, the latent factors. It looks similar to PCA, but instead of dimensionality reduction, factor analysis focuses on studying the latent factors. The variance of an observed variable can be split into two parts: - Common variance: the part of variance that is explained by latent factors; - Unique variance: the part that is specific to only one variable, usually considered as an error component or residual. The factor loadings or weights indicate how much each latent factor is affecting the observed features. 9.2.1 Slalom: Interpretable latent spaces Highlight of Slalom: (Buettner et al. 2017) It incorporates prior information to help the model estimation; It learns whatever is not provided by prior knowledge in the model training process; It enforces sparsity in the weight matrix. 9.2.1.1 Methodology Matrix expression of factor analysis: How prior knowledge affects the model: \\(I_{g, k}\\): (observed) Indicator of whether a gene \\(g\\) is annotated to a given pathway or factor \\(k\\); \\(z_{g, k}\\): (latent) Indicator of whether factor \\(k\\) has a regulatory effect on gene \\(g\\); \\(w_{g, k}\\): (estimated) weights. 
grey arrow: \\[ P(I_{g, k}\\vert z_{g, k}) = \\begin{cases} \\text{Bernoulli}(p_1), \\text{if } z_{g, k} = 1\\\\ \\text{Bernoulli}(p_2), \\text{if } z_{g, k} = 0\\\\ \\end{cases}\\] green arrow: \\[ P(w_{g, k}\\vert z_{g, k}) = \\begin{cases} N(w_{g, k}, 1/\\alpha), \\text{ if } z_{g, k} = 1\\\\ \\delta_0(w_{g, k}), \\text{ if } z_{g, k} = 0\\\\ \\end{cases}\\] We only look at the part of the likelihood that is relavant to this part: \\(\\prod_{g} \\prod_{k}P(I_{g, k}, w_{g, k}, z_{g, k})\\), where \\(P(I_{g, k}, w_{g, k}, z_{g, k}) = P(I_{g, k}, w_{g, k}| z_{g, k})P(z_{g,k}) = P( I_{g, k}| z_{g, k})P( w_{g, k}| z_{g, k})P(z_{g,k})\\). Since we do not know anything about \\(z_{g,k}\\), it is assumed as Bernoulli(1/2). 9.2.1.2 Example First, get a geneset in a GeneSetCollection object. gmtfile <- system.file("extdata", "reactome_subset.gmt", package = "slalom") genesets <- GSEABase::getGmt(gmtfile) Then we create an Rcpp_SlalomModel object containing the input data and genesets (and subsequent results) for the model. model_deng <- newSlalomModel(deng, genesets, n_hidden = 5, min_genes = 10) ## 29 annotated factors retained; 1 annotated factors dropped. ## 1072 genes retained for analysis. Initialize the model: model_deng <- initSlalom(model_deng, seed = 100) Fit/train the model: model_deng <- trainSlalom(model_deng, nIterations = 1000, seed = 100, tolerance = 0.001) ## pre-training model for faster convergence ## iteration 0 ## Model not converged after 50 iterations. ## iteration 0 ## Model not converged after 50 iterations. ## iteration 0 ## Switched off factor 29 ## Switched off factor 20 ## Switched off factor 32 ## Switched off factor 28 ## Switched off factor 13 ## Switched off factor 27 ## Switched off factor 10 ## iteration 100 ## Switched off factor 22 ## iteration 200 ## iteration 300 ## iteration 400 ## iteration 500 ## iteration 600 ## iteration 700 ## Model converged after 701 iterations. 
View results: The plotRelevance function displays the most relevant terms (factors/pathways) ranked by relevance, showing gene set size and the number of genes gained/lost as active in the pathway as learnt by the model. plotRelevance(model_deng) The plotTerms function shows the relevance of all terms in the model, enabling the identification of the most important pathways in the context of all that were included in the model. plotTerms(model_deng) 9.3 Autoencoders (Kingma and Welling 2013) 9.3.1 Background and some notations Data: \\(X\\) Latent variables: \\(Z\\) Something that is not directly observable but is assumed to have an impact on the observed variables. Goal: We believe \\(X\\) can be generated from \\(Z\\) (with some transformation), and want to sample more data from \\(Z\\) that resembles \\(X\\). So we want to find the parameters \\(\\theta\\) such that the probability of generating \\(X\\) from the distribution of \\(Z\\): \\(P(X) = \\int P(X|z; \\theta) P(z) dz\\) is maximized. How do we define \\(Z\\)? -The simplest idea: \\(Z \\sim N(0, 1)\\). It is not impossible, because “any distribution in d dimensions can be generated by taking a set of d variables that are normally distributed and mapping them through a sufficiently complicated function.” -A better idea: For most of \\(z\\), \\(P(X|z; \\theta)\\) will be close to zero, meaning it contributes almost nothing to the estimate of \\(P(X)\\). Thus, we want to sample only those values of \\(Z\\) that are likely to produce \\(X\\). Denote this distribution of \\(Z\\) as \\(Q(Z|X)\\) (it is inferred and therefore depends on \\(X\\)). Advantage: There will be far fewer possible values of \\(Z\\) under \\(Q\\) compared to random sampling, therefore, it will be easier to compute \\(E_{Z \\sim Q} P(X|Z)\\). 
9.3.2 Objective \\[ \\log P(X) - KL[Q(Z|X)\\Vert P(Z|X)] = E_{Z\\sim Q}[\\log P(X|Z)] - KL[Q(Z|X)\\Vert P(Z)]\\] We can get this equation by starting from the definition of Kullback-Leibler divergence, combined with the Bayesian formula and a little algebra. (Not showing details here) LHS: what we want to maximize: Generation loss: how likely the generated samples resembles \\(X\\) - an error term which measures how much information is lost when using \\(Q\\) to represent \\(P\\), it becomes small if \\(Q\\) is high-capacity. (A loose explanation of model capacity: Roughly speaking, the capacity of a model describes how complex a relationship it can model. You could expect a model with higher capacity to be able to model more relationships between more variables than a model with a lower capacity.) RHS: what we can maximize through stochastic gradient descent. References "], +["clustering-and-cell-annotation.html", "10 Clustering and cell annotation 10.1 Clustering Methods 10.2 Clustering example 10.3 An alternative to clustering: Automatic cell annotation", " 10 Clustering and cell annotation 10.1 Clustering Methods Once we have normalized the data and removed confounders we can carry out analyses that are relevant to the biological questions at hand. The exact nature of the analysis depends on the dataset. Nevertheless, there are a few aspects that are useful in a wide range of contexts and we will be discussing some of them in the next few chapters. We will start with the clustering of scRNA-seq data. 10.1.1 Introduction One of the most promising applications of scRNA-seq is de novo discovery and annotation of cell-types based on transcription profiles. Computationally, this is a hard problem as it amounts to unsupervised clustering. That is, we need to identify groups of cells based on the similarities of the transcriptomes without any prior knowledge of the labels. Moreover, in most situations we do not even know the number of clusters a priori. 
The problem is made even more challenging due to the high level of noise (both technical and biological) and the large number of dimensions (i.e. genes). When working with large datasets, it can often be beneficial to apply some sort of dimensionality reduction method. By projecting the data onto a lower-dimensional sub-space, one is often able to significantly reduce the amount of noise. An additional benefit is that it is typically much easier to visualize the data in a 2 or 3-dimensional subspace. We have already discussed PCA (chapter 6.6.2) and t-SNE (chapter 6.6.2). Challenges in clustering What is the number of clusters k? What defines a good clustering? What is a cell type? Scalability: in the last few years the number of cells in scRNA-seq experiments has grown by several orders of magnitude from ~\\(10^2\\) to ~\\(10^6\\) 10.1.2 Unsupervised clustering methods Three main ingredients of a complete clustering method: Measure of similarity: how do we quantify how close two data points are? Quality function: how do we decide how “good” a clustering/partition is? Algorithm: how to find the clustering whose quality function is optimized? 
10.1.2.2 k-means clustering Measure of similarity: Euclidean distance Quality function: Within-cluster distance Algorithm: Advantage: Fast Drawbacks: - Sensitive to initial clustering - Sensitive to outliers - Need to specify K - Tend to find clusters of similar sizes Tools related to K-means: SC3 10.1.2.3 Graph-based methods Real-world networks usually display big inhomogeneities or community structure. Communities or clusters or modules are groups of vertices which probably share common properties and/or play similar roles within the graph. In recent years there has been a lot of interest in detecting communities in networks in various domains. Some of these community detection methods can be applied to scRNA-seq data by building a graph where each vertex represents a cell and the (weight of the) edge measures the similarity between two cells. Actually, graph-based clustering is the most popular clustering algorithm in scRNA-seq data analysis, and has been reported to have outperformed other clustering methods in many situations (Freytag et al. 2018). 10.1.2.3.1 Why do we want to represent the data as a graph? Memory effectiveness: A (complete) graph can be thought of as an alternative expression of the similarity matrix. Current methods (discussed later) aim to build sparse graphs, which ease the memory burden. Curse of dimensionality: All data become sparse in high-dimensional space and therefore similarities measured by Euclidean distances etc. are generally low between all objects. 10.1.2.3.2 Building a graph Step1: Build an unweighted K-nearest neighbour (KNN) graph Step2: Add weights, and obtain a shared nearest neighbour (SNN) graph There are two ways of adding weights: number and rank. - number: The number of shared nodes between \\(u\\) and \\(v\\), in this case, 3. - rank: A measurement of the closeness to their common nearest neighbours. (Xu and Su (2015)) Details of rank : Main idea: The closeness of two people is defined by their closest common friend. 
For each node, say \\(u\\), we can rank its 5 neighbours according to their closeness to \\(u\\), and we can do the same with \\(v\\). Denote the three shared neighbours as \\(x_1\\), \\(x_2\\) and \\(x_3\\), so rank(\\(x_1, u\\)) = 1 means \\(x_1\\) is the closest neighbour of \\(u\\). The idea is, if \\(x_1\\) is also the closest to \\(v\\), then \\(u\\) and \\(v\\) should have a larger similarity, or weight. So we summarize the overall closeness of \\(x_1\\) with both \\(u\\) and \\(v\\) by taking an average: \\(\\dfrac{1}{2}(\\text{rank}(x_1, u), \\text{rank}(x_1, v))\\). Then we find the one with the largest closeness, \\(s(u, v) = \\min \\left[ \\dfrac{1}{2}(\\text{rank}(x_i, u), \\text{rank}(x_i, v)) \\vert i = 1, 2, 3\\right]\\). The final expression of weight: \\[ w(u, v) = K - s(u, v).\\] 10.1.2.3.3 Quality function (Modularity) Modularity (Newman and Girvan 2004) is not the only quality function for graph-based clustering, but it is one of the first attempts to embed in a compact form many questions including the definition of quality function and null model etc. The idea of modularity: A random graph should not have a cluster structure. The more “quality†a partition has compared to a random graph, the “better†the partition is. Specifically, it is defined by: the quality of a partition on the actual graph \\(-\\) the quality of the same partition on a random graph quality : Sum of the weights within clusters random graph : a copy of the original graph, with some of its properties, but without community structure. The random graph defined by modularity is: each node has the same degree as the original graph. 
\\[ Q \\propto \\sum_{i, j} A_{i, j} \\delta(i, j) - \\sum_{i, j} \\dfrac{k_i k_j}{2m} \\delta(i, j)\\] \\(A_{i, j}\\): weight between node \\(i\\) and \\(j\\); \\(\\delta(i, j)\\): indicator of whether \\(i\\) and \\(j\\) are in the same cluster; \\(k_i\\): the degree of node \\(i\\) (the sum of weights of all edges connected to \\(i\\)); \\(m\\): the total weight in the whole graph. Higher modularity implies a better partition: Limits of modularity: (Good, De Montjoye, and Clauset 2010) 1. Resolution limit. Short version: Modularity maximization forces small communities into larger ones. Longer version: For two clusters \\(A\\) and \\(B\\), if \\(k_A k_B < 2m\\) then modularity increases by merging A and B into a single cluster, even if A and B are distinct clusters. 2. Bad, even random partitions may have a high modularity. Networks lack a clear modularity maximum. 10.1.2.3.4 Algorithms : Modularity-based clustering methods implemented in single-cell analysis are mostly greedy algorithms, that are very fast, although not the most accurate approaches.   Louvain: (Blondel et al. 2008)   Leiden:(Traag, Waltman, and Eck 2019) Improved Louvain, hybrid of greedy algorithm and sampling technique 10.1.2.3.5 Advantages: -Fast -No need to specify \\(k\\) 10.1.2.3.6 Tools for graph-based clustering: Seurat: Louvain, Leiden, SLM igraph: fast greedy, Louvain, optimal, walktrap, spinglass, infomap 10.1.2.4 Consensus clustering (more robustness, less computational speed) 10.1.2.4.1 Motivation (Two problems of \\(K\\)-means): Problem1: sensitive to initial partitions  Solution:  Run multiple iterations of \\(K\\)-means on different subsamples of the original dataset, with different initial partitions. Problem2: the selection of \\(K\\).  Solution:  Run \\(K\\)-means with a range of \\(K\\)’s. 
10.1.2.4.2 Algorithm of consensus clustering (simplest version): for(k in the range of K){ for(each subsample of the data){ for(iteration in 1:1000){ kmeans(subsample, k) # each iteration means a different initial partition save partition } } return consensus clustering result of k } 10.1.2.4.3 Subsample obtained by dimensional reduction: steps of PCA: i) transformation of the similarity matrix. ii) ranking eigenvectors according to their corresponding eigenvalues in decreasing order. iii) need to decide how many (\\(d\\)) PC’s or eigenvalues we want to reduce to. In SC3, i) considers two types of transformation: the one with traditional PCA and the associated graph Laplacian. iii) User may specify a range of \\(d\\), or use the default range suggested by the authors according to their experience with empirical results. 10.1.2.4.4 Consensus clustering (combining multiple clustering results): Step1: Represent each partition as a matrix: Say we partitioned four data points into 2 clusters. Step2: Consensus matrix: Average of all the partitions 10.1.2.4.5 Tools for consensus clustering: SC3 10.2 Clustering example library(pcaMethods) library(SC3) library(scater) library(SingleCellExperiment) library(pheatmap) library(mclust) library(igraph) library(scran) 10.2.1 Example 1. Graph-based clustering (deng dataset) To illustrate clustering of scRNA-seq data, we consider the Deng dataset of cells from developing mouse embryo (Deng et al. 2014). We have preprocessed the dataset and created a SingleCellExperiment object in advance. We have also annotated the cells with the cell types identified in the original publication (it is the cell_type2 column in the colData slot). deng <- readRDS("data/deng/deng-reads.rds") First, we build a \\(K\\)-NN graph with a package function from scran. The most important decision of building a graph is the choice of \\(K\\), for which there is no standard rule. In general, we can think of it as an indication of the desired cluster size. 
If \\(K\\) is too small, a genuine cluster might be split into parts, while if \\(K\\) is too large, clusters might not be thoroughly separated. deng5 <- buildSNNGraph(deng, k = 5) deng15 <- buildSNNGraph(deng, k = 15) deng25 <- buildSNNGraph(deng, k = 25) par(mfrow=c(1,3)) plot(deng5, vertex.size = 4, vertex.label = NA) title("5-NN" ,line = -33, cex.main = 3) plot(deng15, vertex.size = 4, vertex.label = NA) title("15-NN" ,line = -33, cex.main = 3) plot(deng25, vertex.size = 4, vertex.label = NA) title("25-NN" ,line = -33, cex.main = 3) Perform Louvain clustering: cl <- igraph::cluster_louvain(deng15)$membership colData(deng)$cl <- factor(cl) mclust::adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$cl) ## [1] 0.4197754 Reaches very high similarity with the labels provided in the original paper. However, it tends to merge small clusters into larger ones. table(deng$cell_type2, cl) ## cl ## 1 2 3 ## 16cell 49 0 1 ## 4cell 0 14 0 ## 8cell 36 0 1 ## early2cell 0 8 0 ## earlyblast 0 0 43 ## late2cell 0 10 0 ## lateblast 0 0 30 ## mid2cell 0 12 0 ## midblast 0 0 60 ## zy 0 4 0 10.2.2 Example 2. 
Graph-based clustering (segerstolpe dataset) muraro <- readRDS("data/pancreas/muraro.rds") ## PCA var.fit <- suppressWarnings(trendVar(muraro, parametric=TRUE, use.spikes=F)) muraro <- suppressWarnings(denoisePCA(muraro, technical=var.fit$trend)) dim(reducedDim(muraro, "PCA")) ## [1] 2126 5 ## Build graph and clustering gr <- buildSNNGraph(muraro, use.dimred="PCA", k = 30) cl <- igraph::cluster_louvain(gr)$membership colData(muraro)$cl <- factor(cl) mclust::adjustedRandIndex(colData(muraro)$cell_type1, colData(muraro)$cl) ## [1] 0.4845618 table(muraro$cell_type1, cl) ## cl ## 1 2 3 4 5 6 7 8 9 ## acinar 0 0 0 0 0 0 218 0 1 ## alpha 202 306 274 5 15 9 1 0 0 ## beta 1 0 0 5 195 21 2 220 4 ## delta 0 0 0 0 18 174 0 1 0 ## ductal 0 0 0 215 0 1 7 3 19 ## endothelial 0 0 0 0 0 0 0 0 21 ## epsilon 0 0 0 0 0 3 0 0 0 ## gamma 1 0 1 0 0 97 2 0 0 ## mesenchymal 0 0 0 1 0 0 0 0 79 ## unclear 0 0 0 4 0 0 0 0 0 10.2.3 Example 3. SC3 Let’s run SC3 clustering on the Deng data. The advantage of the SC3 is that it can directly ingest a SingleCellExperiment object. SC3 can estimate a number of clusters: deng <- sc3_estimate_k(deng) ## Estimating k... metadata(deng)$sc3$k_estimation ## [1] 6 Next we run SC3 (we also ask it to calculate biological properties of the clusters): deng <- sc3(deng, ks = 10, biology = TRUE, n_cores = 1) ## Setting SC3 parameters... ## Calculating distances between the cells... ## Performing transformations and calculating eigenvectors... ## Performing k-means clustering... ## Calculating consensus matrix... ## Calculating biology... SC3 result consists of several different outputs (please look in (Kiselev et al. 2017) and SC3 vignette for more details). 
Here we show some of them: Consensus matrix: sc3_plot_consensus(deng, k = 10, show_pdata = "cell_type2") Silhouette plot: sc3_plot_silhouette(deng, k = 10) Heatmap of the expression matrix: sc3_plot_expression(deng, k = 10, show_pdata = "cell_type2") Identified marker genes: sc3_plot_markers(deng, k = 10, show_pdata = "cell_type2") PCA plot with highlighted SC3 clusters: plotPCA(deng, colour_by = "sc3_10_clusters") Compare the results of SC3 clustering with the original publication cell type labels: adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$sc3_10_clusters) ## [1] 0.7796181 Note SC3 can also be run in an interactive Shiny session: sc3_interactive(deng) This command will open SC3 in a web browser. Note Due to direct calculation of distances SC3 becomes very slow when the number of cells is \\(>5000\\). For large datasets containing up to \\(10^5\\) cells we recommend using Seurat (see chapter 16). 10.3 An alternative to clustering: Automatic cell annotation 10.3.1 SingleR 10.3.1.1 Methodology Step1. Find variable genes 1-1. For every gene, obtain median grouped by label. 1-2. Select genes that make at least one label different. For example, if we are looking for the genes that make label “green” different from label “red”, we subtract the first column from the second, and pick the top \\(N\\) highest and positive values. All analysis onwards uses only the selected variable genes. Step2. Spearman’s correlation Spearman’s correlation \\(\\in [-1, 1]\\) is a measure of the strength of a linear or monotonic relationship between paired data. We compute the Spearman’s correlation for all pairs of cells in the test and reference dataset, and obtain an \\(n_{\\text{test}} \\times n_{\\text{ref}}\\) correlation matrix, where \\(n\\) is the number of cells (see the first matrix in Step3). Step3. Scoring We want to know how each cell in the test data is correlated to the labels in the reference data, instead of each reference cell. 
So we take the correlations of a cell in the test data with all the cells with a certain label in the reference data, and summarize them into one number or a score, in SingleR, the default is to take the \\(80\\%\\) quantile. Step4. Fine tuning We stop here and assign each cell with label that score the highest, actually, if we set the argument fine.tune = FALSE, that is exactly what the package function SingleR does. But there is one more question, what if the second highest score is very close to the highest? say, 1, 1, 1, 9.5, 10. SingleR set a threshold to define how close is “very closeâ€, the default is 0.05. For (only) the cells that falls into this category, it goes back to Step2. 10.3.1.2 Example (Note: SingleR is not yet available in the released version of Bioconductor. It will be possible to run it as shown once the next Bioconductor release is made in late October.) library(scRNAseq) library(SingleR) segerstolpe <- readRDS("data/pancreas/segerstolpe.rds") sceM <- suppressMessages(MuraroPancreasData()) sceM <- sceM[,!is.na(sceM$label)] sceM <- logNormCounts(sceM) ## find common gene rownames(sceM) <- gsub("__.*","",rownames(sceM)) common <- intersect(rownames(sceM), rownames(segerstolpe)) sceM <- sceM[common,] segerstolpe <- segerstolpe[common,] ## Prepare reference out <- pairwiseTTests(logcounts(sceM), sceM$label, direction="up") markers <- getTopMarkers(out$statistics, out$pairs, n=10) ## Annotation pred <- SingleR(test=segerstolpe, ref=sceM, labels=sceM$label, genes=markers) ## View result plotScoreHeatmap(pred, show.labels = TRUE, annotation_col=data.frame( row.names=rownames(pred))) 10.3.2 scmap ## Load data segerstolpe <- readRDS("data/pancreas/segerstolpe.rds") # test library(scRNAseq) sceM <- readRDS("data/pancreas/muraro.rds") # reference rownames(sceM) <- gsub("__.*","",rownames(sceM)) Select the most informative features (genes) using the dropout feature selection method. By default select 500 features. 
library(scmap) rowData(sceM)$feature_symbol <- rownames(sceM) sceM <- selectFeatures(sceM, suppress_plot = TRUE) Index of a reference dataset is created by finding the median gene expression for each cluster. First, chop the total of 500 features into \\(M = 50\\) chuncks/ low-dimensional subspace. Second, cluster each chunk into \\(k = \\sqrt{N}\\) clusters, where \\(N\\) is the number of cells. By default scmap uses the cell_type1 column of the colData slot in the reference to identify clusters. sceM <- indexCell(sceM) The function indexCluster writes the scmap_cluster_index item of the meta data slot of the reference dataset sceM. This step has two outputs: names(metadata(sceM)$scmap_cell_index) ## [1] "subcentroids" "subclusters" subcentroids returns cluster centers: cat(length(metadata(sceM)$scmap_cell_index$subcentroids), " chunks \\n") ## 50 chunks cat("The dimension of cluster centers in each chunk: ", dim(metadata(sceM)$scmap_cell_index$subcentroids[[1]]), "\\n") ## The dimension of cluster centers in each chunk: 10 46 subclusters contains information about which cluster (label) the cells belong to dim(metadata(sceM)$scmap_cell_index$subclusters) ## [1] 50 2126 metadata(sceM)$scmap_cell_index$subclusters[1:5,1:5] ## D28.1_1 D28.1_13 D28.1_15 D28.1_17 D28.1_2 ## [1,] 6 11 7 38 36 ## [2,] 1 16 17 44 38 ## [3,] 28 17 4 45 25 ## [4,] 43 41 40 33 22 ## [5,] 36 27 29 11 35 Projection: Once the scmap-cell indexes have been generated we can use them to project the test dataset. 
scmapCell_results <- scmapCell( projection = segerstolpe, index_list = list( sceM = metadata(sceM)$scmap_cell_index ) ) names(scmapCell_results) ## [1] "sceM" The cells matrix contains the top 10 (scmap default) cell IDs of the cells of the reference dataset that a given cell of the projection dataset is closest to: dim(scmapCell_results$sceM$cells) ## [1] 10 3514 Cell annotation: If cell cluster annotation is available for the reference datasets, scmap-cell can also annotate the cells from the projection dataset using the labels of the reference. It does so by looking at the top 3 nearest neighbours (scmap default) and if they all belong to the same cluster in the reference and their maximum similarity is higher than a threshold (0.5 is the scmap default), then a projection cell is assigned to the corresponding reference cluster: scmapCell_clusters <- scmapCell2Cluster( scmapCell_results, list( colData(sceM)$cell_type1 )) Plot result Compare the annotated result with the original label in the segerstolpe dataset. plot( getSankey( segerstolpe$cell_type1, scmapCell_clusters$combined_labs, plot_height = 400 ) ) 10.3.3 sessionInfo() References "], +["trajectory-inference.html", "11 Trajectory inference 11.1 First look at Deng data", " 11 Trajectory inference library(SingleCellExperiment) library(TSCAN) library(M3Drop) library(monocle) library(destiny) library(scater) library(ggplot2) library(ggthemes) library(ggbeeswarm) library(corrplot) library(Polychrome) library(slingshot) library(SLICER) library(ouija) set.seed(1) In many situations, one is studying a process where cells change continuously. This includes, for example, many differentiation processes taking place during development: following a stimulus, cells will change from one cell-type to another. Ideally, we would like to monitor the expression levels of an individual cell over time. Unfortunately, such monitoring is not possible with scRNA-seq since the cell is lysed (destroyed) when the RNA is extracted. 
Instead, we must sample at multiple time-points and obtain snapshots of the gene expression profiles. Since some of the cells will proceed faster along the differentiation than others, each snapshot may contain cells at varying points along the developmental progression. We use statistical methods to order the cells along one or more trajectories which represent the underlying developmental trajectories, this ordering is referred to as “pseudotimeâ€. In this chapter we will consider five different tools: TSCAN,Slingshot,Monocle and some off-the-shelf methods like PCA, for ordering cells according to their pseudotime development. To illustrate the methods we will be using a dataset on mouse embryonic development (Deng et al. 2014). The dataset consists of 268 cells from 10 different time-points of early mouse development. In this case, there is no need for pseudotime alignment since the cell labels provide information about the development trajectory. Thus, the labels allow us to establish a ground truth so that we can evaluate and compare the different methods. A recent benchmarking paper by Saelens et al (Saelens et al. 2019) provides a detailed summary of the various computational methods for trajectory inference from single-cell transcriptomics (Saelens et al. 2019). They discuss 45 tools and evaluate them across various aspects including accuracy, scalability, and usability. The following figures from the paper summarise several key aspects and some of the features of the tools being evaluated: Figure 2.3: Overview of several key aspects of the evaluation (Fig. 1 from Saelens et al, 2019). The Characterizatics of the 45 TI tools: Figure 2.4: Characterization of trajectory inference methods for single-cell transcriptomics data (Fig. 2 from Saelens et al, 2019). 
The detailed evaluation results of the 45 TI tools: Figure 2.5: Detailed results of the four main evaluation criteria: accuracy, scalability, stability and usability of trajectory inference methods for single-cell transcriptomics data (Fig. 3 from Saelens et al, 2019). 11.1 First look at Deng data Let us take a first look at the Deng(Deng et al. 2014) data, without yet applying sophisticated pseudotime methods. As the plot below shows, simple PCA does a very good job of displaying the structure in these data. It is only once we reach the blast cell types (“earlyblastâ€, “midblastâ€, “lateblastâ€) that PCA struggles to separate the distinct cell types. deng_SCE <- readRDS("data/deng/deng-reads.rds") deng_SCE$cell_type2 <- factor( deng_SCE$cell_type2, levels = c("zy", "early2cell", "mid2cell", "late2cell", "4cell", "8cell", "16cell", "earlyblast", "midblast", "lateblast") ) cellLabels <- deng_SCE$cell_type2 deng <- counts(deng_SCE) colnames(deng) <- cellLabels deng_SCE <- scater::runPCA(deng_SCE,ncomponent = 5) ## change color Palette with library(Polychrome) set.seed(723451) # for reproducibility my_color <- createPalette(10, c("#010101", "#ff0000"), M=1000) names(my_color) <- unique(as.character(deng_SCE$cell_type2)) pca_df <- data.frame(PC1 = reducedDim(deng_SCE,"PCA")[,1], PC2 = reducedDim(deng_SCE,"PCA")[,2], cell_type2 = deng_SCE$cell_type2) ggplot(data = pca_df)+geom_point(mapping = aes(x = PC1, y = PC2, colour = cell_type2))+ scale_colour_manual(values = my_color)+theme_classic() PCA, here, provides a useful baseline for assessing different pseudotime methods. For a very naive pseudotime we can just take the co-ordinates of the first principal component. 
#deng_SCE$PC1 <- reducedDim(deng_SCE, "PCA")[,1] ggplot(pca_df, aes(x = PC1, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_colour_manual(values = my_color) + theme_classic() + xlab("First principal component") + ylab("Timepoint") + ggtitle("Cells ordered by first principal component") As the plot above shows, PC1 struggles to correctly order cells early and late in the developmental timecourse, but overall does a relatively good job of ordering cells by developmental time. Can bespoke pseudotime methods do better than naive application of PCA? 11.1.1 TSCAN TSCAN (Ji and Ji 2019) combines clustering with pseudotime analysis. First it clusters the cells using mclust, which is based on a mixture of normal distributions. Then it builds a minimum spanning tree to connect the clusters. The branch of this tree that connects the largest number of clusters is the main branch which is used to determine pseudotime. Note From a connected graph with weighted edges, MST is the tree structure that connects all the nodes in a way that has the minimum total edge weight. The trajectory inference methods that use MST is based on the idea that nodes (cells/clusters of cells) and their connections represent the geometric shape of the data cloud in a two-dimenension space. First we will try to use all genes to order the cells. procdeng <- TSCAN::preprocess(counts(deng_SCE)) colnames(procdeng) <- 1:ncol(deng_SCE) dengclust <- TSCAN::exprmclust(procdeng, clusternum = 10) TSCAN::plotmclust(dengclust) dengorderTSCAN <- TSCAN::TSCANorder(dengclust, orderonly = FALSE) pseudotime_order_tscan <- as.character(dengorderTSCAN$sample_name) deng_SCE$pseudotime_order_tscan <- NA deng_SCE$pseudotime_order_tscan[as.numeric(dengorderTSCAN$sample_name)] <- dengorderTSCAN$Pseudotime Frustratingly, TSCAN only provides pseudotime values for 221 of 268 cells, silently returning missing values for non-assigned cells. 
Again, we examine which timepoints have been assigned to each state: cellLabels[dengclust$clusterid == 10] ## [1] late2cell late2cell late2cell late2cell late2cell late2cell late2cell ## [8] late2cell late2cell late2cell ## 10 Levels: zy early2cell mid2cell late2cell 4cell 8cell ... lateblast ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_order_tscan, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab("TSCAN pseudotime") + ylab("Timepoint") + ggtitle("Cells ordered by TSCAN pseudotime") TSCAN gets the development trajectory the “wrong way aroundâ€, in the sense that later pseudotime values correspond to early timepoints and vice versa. This is not inherently a problem (it is easy enough to reverse the ordering to get the intuitive interpretation of pseudotime), but overall it would be a stretch to suggest that TSCAN performs better than PCA on this dataset. (As it is a PCA-based method, perhaps this is not entirely surprising.) Exercise 1 Compare results for different numbers of clusters (clusternum). 11.1.2 Slingshot Slingshot (Street et al. 2018) is a single-cell lineage inference tool, it can work with datasets with multiple branches. Slingshot has two stages: 1) the inference of the global lineage structure using MST on clustered data points and 2) the inference of pseudotime variables for cells along each lineage by fitting simultaneous ‘principal curves’ across multiple lineages. Slingshot’s first stage uses a cluster-based MST to stably identify the key elements of the global lineage structure, i.e., the number of lineages and where they branch. This allows us to identify novel lineages while also accommodating the use of domain-specific knowledge to supervise parts of the tree (e.g., terminal cellular states). 
For the second stage, we propose a novel method called simultaneous principal curves, to fit smooth branching curves to these lineages, thereby translating the knowledge of global lineage structure into stable estimates of the underlying cell-level pseudotime variable for each lineage. Slingshot had consistently performing well across different datasets as reported by Saelens et al, let’s have a run for the deng dataset. It is recommended by Slingshot to run in a reduced dimensions. __Note_ Principal curves are smooth one-dimensional curves that pass through the middle of a p-dimensional data set, providing a nonlinear summary of the data. They are nonparametric, and their shape is suggested by the data (Hastie et al)(Hastie and Stuetzle 1989). ## runing slingshot deng_SCE <- slingshot(deng_SCE, clusterLabels = 'cell_type2',reducedDim = "PCA", allow.breaks = FALSE) ## Using diagonal covariance matrix summary(deng_SCE$slingPseudotime_1) ## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ## 0.00 52.19 59.81 60.34 81.60 85.72 55 ## get lineages inferred by slingshot lnes <- getLineages(reducedDim(deng_SCE,"PCA"), deng_SCE$cell_type2) ## Using diagonal covariance matrix lnes@lineages ## $Lineage1 ## [1] "zy" "early2cell" "mid2cell" "late2cell" "4cell" ## [6] "16cell" "midblast" "earlyblast" ## ## $Lineage2 ## [1] "zy" "early2cell" "mid2cell" "late2cell" "4cell" ## [6] "16cell" "midblast" "lateblast" ## ## $Lineage3 ## [1] "zy" "early2cell" "mid2cell" "late2cell" "4cell" ## [6] "16cell" "8cell" ## plot the lineage overlay on the orginal PCA plot plot(reducedDims(deng_SCE)$PCA, col = my_color[as.character(deng_SCE$cell_type2)], pch=16, asp = 1) legend("bottomleft",legend = names(my_color[levels(deng_SCE$cell_type2)]), fill = my_color[levels(deng_SCE$cell_type2)]) lines(SlingshotDataSet(deng_SCE), lwd=2, type = 'lineages', col = c("black")) ## Plotting the pseudotime inferred by slingshot by cell types slingshot_df <- data.frame(colData(deng_SCE)) ggplot(slingshot_df, aes(x = 
slingPseudotime_1, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + theme_classic() + xlab("First Slingshot pseudotime") + ylab("cell type") + ggtitle("Cells ordered by Slingshot pseudotime")+scale_colour_manual(values = my_color) ggplot(slingshot_df, aes(x = slingPseudotime_2, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + theme_classic() + xlab("Second Slingshot pseudotime") + ylab("cell type") + ggtitle("Cells ordered by Slingshot pseudotime")+scale_colour_manual(values = my_color) ggplot(slingshot_df, aes(x = slingPseudotime_1, y = slingPseudotime_2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + theme_classic() + xlab("First Slingshot pseudotime") + ylab("Second Slingshot pseudotime") + ggtitle("Cells ordered by Slingshot pseudotime")+scale_colour_manual(values = my_color) # # ggplot(slingshot_df, aes(x = slingPseudotime_1, y = slingPseudotime_2, # colour = slingPseudotime_3)) + # geom_point() + theme_classic() + # xlab("First Slingshot pseudotime") + ylab("Second Slingshot pseudotime") + # ggtitle("Cells ordered by Slingshot pseudotime")+facet_wrap(.~cell_type2) Note You can also supply a start and an end cluster to slingshot. Comments Did you notice the ordering of clusters in the lineage prediced for 16cells state? There is an outlier-like cell in the 16cell group, find the outlier and remove it, then re-run Slingshot. 11.1.3 GAM general additive model for identifying temporally expressed genes After running slingshot, an interesting next step may be to find genes that change their expression over the course of development. We demonstrate one possible method for this type of analysis on the 100 most variable genes. We will regress each gene on the pseudotime variable we have generated, using a general additive model (GAM). This allows us to detect non-linear patterns in gene expression. 
library(gam) t <- deng_SCE$slingPseudotime_1 # for time, only look at the 100 most variable genes Y <- log1p(assay(deng_SCE,"logcounts")) var100 <- names(sort(apply(Y,1,var),decreasing = TRUE))[1:100] Y <- Y[var100,] # fit a GAM with a loess term for pseudotime gam.pval <- apply(Y,1,function(z){ d <- data.frame(z=z, t=t) suppressWarnings({ tmp <- gam(z ~ lo(t), data=d) }) p <- summary(tmp)[3][[1]][2,3] p }) ## Plot the top 100 genes' expression topgenes <- names(sort(gam.pval, decreasing = FALSE))[1:100] heatdata <- assays(deng_SCE)$logcounts[topgenes, order(t, na.last = NA)] heatclus <- deng_SCE$cell_type2[order(t, na.last = NA)] heatmap(heatdata, Colv = NA, ColSideColors = my_color[heatclus],cexRow = 1,cexCol = 1) We will regress each gene on the pseudotime variable we have generated, using a general additive model (GAM). This allows us to detect non-linear patterns in gene expression. 11.1.4 Monocle The original Monocle (Trapnell et al. 2014) method skips the clustering stage of TSCAN and directly builds a minimum spanning tree on a reduced dimension representation (using ‘ICA’) of the cells to connect all cells. Monocle then identifies the longest path in this tree as the main branch and uses this to determine pseudotime. Priors are required such as start/end state and the number of branching events. If the data contains diverging trajectories (i.e. one cell type differentiates into two different cell-types), monocle can identify these. Each of the resulting forked paths is defined as a separate cell state. 11.1.5 Monocle 2 Monocle 2 (Qiu et al. 2017) uses a different approach, with dimensionality reduction and ordering performed by reverse graph embedding (RGE), allowing it to detect branching events in an unsupervised manner. RGE, a machine-learning strategy, learns a ‘principal graph’ to describe the single-cell dataset. RGE also learns the mapping function of data points on the trajectory back to the original high dimentional space simutaneously. 
In doing so, it aims to position the latent points in the lower dimension space (along the trajectory) while also ensuring their corresponding positions in the input dimension are ‘neighbors’. There are different ways of implementing the RGE framework, Monocle 2 uses DDRTree(Discriminative dimensionality reduction via learning a tree) by default. DDRTree learns latent points and the projection of latent points to the points in original input space, which is equivalent to “dimension reductionâ€. In addition, it simutanously learns ‘principal graph’ for K-means soft clustered cetroids for the latent points. Principal graph is the spanning tree of those centroids. DDRTree returns a principal tree of the centroids of cell clusters in low dimension, pseudotime is derived for individual cells by calculating geomdestic distance of their projections onto the tree from the root (user-defined or arbitrarily assigned). Note Informally, a principal graph is like a principal curve which passes through the ‘middle’ of a data set but is allowed to have branches. 
library(monocle) #d <- deng_SCE[m3dGenes,] ## feature selection deng <- counts(deng_SCE) m3dGenes <- as.character( M3DropFeatureSelection(deng)$Gene ) d <- deng_SCE[which(rownames(deng_SCE) %in% m3dGenes), ] d <- d[!duplicated(rownames(d)), ] colnames(d) <- 1:ncol(d) geneNames <- rownames(d) rownames(d) <- 1:nrow(d) pd <- data.frame(timepoint = cellLabels) pd <- new("AnnotatedDataFrame", data=pd) fd <- data.frame(gene_short_name = geneNames) fd <- new("AnnotatedDataFrame", data=fd) dCellData <- newCellDataSet(counts(d), phenoData = pd, featureData = fd) # dCellData <- setOrderingFilter(dCellData, which(geneNames %in% m3dGenes)) dCellData <- estimateSizeFactors(dCellData) dCellDataSet <- reduceDimension(dCellData,reduction_method = "DDRTree", pseudo_expr = 1) dCellDataSet <- orderCells(dCellDataSet, reverse = FALSE) plot_cell_trajectory(dCellDataSet) # Store the ordering pseudotime_monocle2 <- data.frame( Timepoint = phenoData(dCellDataSet)$timepoint, pseudotime = phenoData(dCellDataSet)$Pseudotime, State = phenoData(dCellDataSet)$State ) rownames(pseudotime_monocle2) <- 1:ncol(d) pseudotime_order_monocle <- rownames(pseudotime_monocle2[order(pseudotime_monocle2$pseudotime), ]) Note check other available methods for ?reduceDimension We can again compare the inferred pseudotime to the known sampling timepoints. deng_SCE$pseudotime_monocle2 <- pseudotime_monocle2$pseudotime ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_monocle2, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab("monocle2 pseudotime") + ylab("Timepoint") + ggtitle("Cells ordered by monocle2 pseudotime") Monocle 2 performs pretty well on these cells. 11.1.6 Monocle 3 Monocle3(Cao et al. 2019) is the updated single-cell analysis toolkit for analysing large datasets. Monocle 3 is designed for use with absolute transcript counts (e.g. from UMI experiments). 
It first does dimension reduction with UMAP and then clusters the cells with Louvian/Leiden algorithms and merge adjacent groups into supergroup, and finaly resovles the trajectories individual cells can take during development, identifies the locations of branches and convergences within each supergroup. In short, Monocle3 uses UMAP to construct a initial trajectory inference and refines it with learning principal graph. It builds KNN graph in the UMAP dimensions and runs Louvain/Leiden algorithms om the KNN graph to derive communities; edges are drawn to connect communities that have more links (Partitioned Approximate Graph Abstraction (PAGA) graph). Each component of the PAGA grah is passed to the next step which is learning principal graph based on the SimplePPT algorithm. The pseudotime is calculated for individual cells by projecting the cells to their nearest point on the principal graph edge and measure geodesic distance along of principal points to the closest of their root nodes. library(monocle3) ## ## Attaching package: 'monocle3' ## The following objects are masked from 'package:monocle': ## ## plot_genes_in_pseudotime, plot_genes_violin, ## plot_pc_variance_explained ## The following objects are masked from 'package:Biobase': ## ## exprs, fData, fData<-, pData, pData<- gene_meta <- rowData(deng_SCE) #gene_metadata must contain a column verbatim named 'gene_short_name' for certain functions. 
gene_meta$gene_short_name <- rownames(gene_meta) cds <- new_cell_data_set(expression_data = counts(deng_SCE), cell_metadata = colData(deng_SCE), gene_metadata = gene_meta) ## Step 1: Normalize and pre-process the data cds <- preprocess_cds(cds,num_dim = 5) plot_pc_variance_explained(cds) ## Step 3: Reduce the dimensions using UMAP cds <- reduce_dimension(cds) ## No preprocess_method specified, using preprocess_method = 'PCA' ## Step 4: Cluster the cells cds <- cluster_cells(cds) ## change the clusters ## cds@clusters$UMAP$clusters <- deng_SCE$cell_type2 ## Step 5: Learn a graph cds <- learn_graph(cds,use_partition = TRUE) ## Step 6: Order cells cds <- order_cells(cds, root_cells = c("zy","zy.1","zy.2","zy.3") ) plot_cells(cds, color_cells_by="cell_type2", graph_label_size = 4, cell_size = 2, group_label_size = 6)+ scale_color_manual(values = my_color) plot_cells(cds, graph_label_size = 6, cell_size = 1, color_cells_by="pseudotime", group_label_size = 6) ## Cells aren't colored in a way that allows them to be grouped. pdata_cds <- pData(cds) pdata_cds$pseudotime_monocle3 <- monocle3::pseudotime(cds) ggplot(as.data.frame(pdata_cds), aes(x = pseudotime_monocle3, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab("monocle3 pseudotime") + ylab("Timepoint") + ggtitle("Cells ordered by monocle3 pseudotime") deng_SCE$pseudotime_monocle3 <- pdata_cds$pseudotime_monocle3 It did not work well for our small Smart-seq2 dataset. 11.1.7 Diffusion maps Diffusion maps were introduced by Ronald Coifman and Stephane Lafon(Coifman and Lafon 2006), and the underlying idea is to assume that the data are samples from a diffusion process. The method infers the low-dimensional manifold by estimating the eigenvalues and eigenvectors for the diffusion operator related to the data. Angerer et al(Angerer et al. 
2016) have applied the diffusion maps concept to the analysis of single-cell RNA-seq data to create an R package called destiny. We will take the ranko prder of cells in the first diffusion map component as “diffusion map pseudotime†here. deng <- logcounts(deng_SCE) colnames(deng) <- cellLabels dm <- DiffusionMap(t(deng)) tmp <- data.frame(DC1 = eigenvectors(dm)[,1], DC2 = eigenvectors(dm)[,2], Timepoint = deng_SCE$cell_type2) ggplot(tmp, aes(x = DC1, y = DC2, colour = Timepoint)) + geom_point() + scale_color_manual(values = my_color) + xlab("Diffusion component 1") + ylab("Diffusion component 2") + theme_classic() deng_SCE$pseudotime_diffusionmap <- rank(eigenvectors(dm)[,1]) ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_diffusionmap, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab("Diffusion map pseudotime (first diffusion map component)") + ylab("Timepoint") + ggtitle("Cells ordered by diffusion map pseudotime") Like the other methods, using the first diffusion map component from destiny as pseudotime does a good job at ordering the early time-points (if we take high values as “earlier†in developement), but it is unable to distinguish the later ones. Exercise 2 Do you get a better resolution between the later time points by considering additional eigenvectors? Exercise 3 How does the ordering change if you only use the genes identified by M3Drop? 11.1.8 Other methods 11.1.8.1 SLICER The SLICER(Welch, Hartemink, and Prins 2016) method is an algorithm for constructing trajectories that describe gene expression changes during a sequential biological process, just as Monocle and TSCAN are. SLICER is designed to capture highly nonlinear gene expression changes, automatically select genes related to the process, and detect multiple branch and loop features in the trajectory (Welch, Hartemink, and Prins 2016). 
The SLICER R package is available from its GitHub repository and can be installed from there using the devtools package. We use the select_genes function in SLICER to automatically select the genes to use in builing the cell trajectory. The function uses “neighbourhood variance†to identify genes that vary smoothly, rather than fluctuating randomly, across the set of cells. Following this, we determine which value of “k†(number of nearest neighbours) yields an embedding that most resembles a trajectory. Then we estimate the locally linear embedding of the cells. library("lle") slicer_genes <- select_genes(t(deng)) k <- select_k(t(deng[slicer_genes,]), kmin = 30, kmax=60) ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates slicer_traj_lle <- lle(t(deng[slicer_genes,]), m = 2, k)$Y ## finding neighbours ## calculating weights ## computing coordinates reducedDim(deng_SCE, "LLE") <- slicer_traj_lle plot_df <- data.frame(slicer1 = reducedDim(deng_SCE, "LLE")[,1], slicer2 = reducedDim(deng_SCE, "LLE")[,2], cell_type2 = deng_SCE$cell_type2) ggplot(data = plot_df)+geom_point(mapping = aes(x = slicer1, y = slicer2, color = cell_type2))+ scale_color_manual(values = my_color)+ xlab("LLE component 1") + ylab("LLE component 2") + ggtitle("Locally linear embedding of cells from SLICER")+ theme_classic() With the locally linear embedding computed we can construct a k-nearest neighbour graph that is fully connected. This plot displays a (yellow) circle for each cell, with the cell ID number overlaid in blue. 
Here we show the graph computed using 10 nearest neighbours. Here, SLICER appears to detect one major trajectory with one branch. slicer_traj_graph <- conn_knn_graph(slicer_traj_lle, 10) plot(slicer_traj_graph, main = "Fully connected kNN graph from SLICER") From this graph we can identify “extreme†cells that are candidates for start/end cells in the trajectory. ends <- find_extreme_cells(slicer_traj_graph, slicer_traj_lle) start <- ends[1] Having defined a start cell we can order the cells in the estimated pseudotime. pseudotime_order_slicer <- cell_order(slicer_traj_graph, start) branches <- assign_branches(slicer_traj_graph, start) pseudotime_slicer <- data.frame( Timepoint = cellLabels, pseudotime = NA, State = branches ) pseudotime_slicer$pseudotime[pseudotime_order_slicer] <- 1:length(pseudotime_order_slicer) deng_SCE$pseudotime_slicer <- pseudotime_slicer$pseudotime We can again compare the inferred pseudotime to the known sampling timepoints. SLICER does not provide a pseudotime value per se, just an ordering of cells. ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_slicer, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab("SLICER pseudotime (cell ordering)") + ylab("Timepoint") + theme_classic() Like the previous method, SLICER (Welch, Hartemink, and Prins 2016) here provides a good ordering for the early time points. It places “16cell†cells before “8cell†cells, but provides better ordering for blast cells than many of the earlier methods. Exercise 4 How do the results change for different k? (e.g. k = 5) What about changing the number of nearest neighbours in the call to conn_knn_graph? Exercise 5 How does the ordering change if you use a different set of genes from those chosen by SLICER (e.g. the genes identified by M3Drop)? 
11.1.8.2 Ouija Ouija (http://kieranrcampbell.github.io/ouija/) takes a different approach from the pseudotime estimation methods we have looked at so far. Earlier methods have all been “unsupervisedâ€, which is to say that apart from perhaps selecting informative genes we do not supply the method with any prior information about how we expect certain genes or the trajectory as a whole to behave. Ouija, in contrast, is a probabilistic framework that allows for interpretable learning of single-cell pseudotimes using only small panels of marker genes. This method: infers pseudotimes from a small number of marker genes letting you understand why the pseudotimes have been learned in terms of those genes; provides parameter estimates (with uncertainty) for interpretable gene regulation behaviour (such as the peak time or the upregulation time); has a Bayesian hypothesis test to find genes regulated before others along the trajectory; identifies metastable states, ie discrete cell types along the continuous trajectory. We will supply the following marker genes to Ouija (with timepoints where they are expected to be highly expressed): Early timepoints: Dazl, Rnf17, Sycp3, Nanog, Pou5f1, Fgf8, Egfr, Bmp5, Bmp15 Mid timepoints: Zscan4b, Foxa1, Prdm14, Sox21 Late timepoints: Creb3, Gpx4, Krt8, Elf5, Eomes, Cdx2, Tdgf1, Gdf3 With Ouija we can model genes as either exhibiting monotonic up or down regulation (known as switch-like behaviour), or transient behaviour where the gene briefly peaks. By default, Ouija assumes all genes exhibit switch-like behaviour (the authors assure us not to worry if we get it wrong - the noise model means incorrectly specifying a transient gene as switch-like has minimal effect). Here we can “cheat†a little and check that our selected marker genes do actually identify different timepoints of the differentiation process. 
ouija_markers_down <- c("Dazl", "Rnf17", "Sycp3", "Fgf8", "Egfr", "Bmp5", "Bmp15", "Pou5f1") ouija_markers_up <- c("Creb3", "Gpx4", "Krt8", "Elf5", "Cdx2", "Tdgf1", "Gdf3", "Eomes") ouija_markers_transient <- c("Zscan4b", "Foxa1", "Prdm14", "Sox21") ouija_markers <- c(ouija_markers_down, ouija_markers_up, ouija_markers_transient) plotExpression(deng_SCE, ouija_markers, x = "cell_type2", colour_by = "cell_type2") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) In order to fit the pseudotimes we simply call ouija, passing in the expected response types. Note that if no response types are provided then they are all assumed to be switch-like by default, which we will do here. The input to Ouija can be a cell-by-gene matrix of non-negative expression values, or an ExpressionSet object, or, happily, by selecting the logcounts values from a SingleCellExperiment object. We can apply prior information about whether genes are up- or down-regulated across the differentiation process, and also provide prior information about when the switch in expression or a peak in expression is likely to occur. We can fit the Ouija model using either: Hamiltonian Monte Carlo (HMC) - full MCMC inference where gradient information of the log-posterior is used to "guide" the random walk through the parameter space, or Automatic Differentiation Variational Bayes (ADVI or simply VI) - approximate inference where the KL divergence to an approximate distribution is minimised. In general, HMC will provide more accurate inference with approximately correct posterior variance for all parameters. However, VB is orders of magnitude quicker than HMC and while it may underestimate posterior variance, the Ouija authors suggest that anecdotally it often performs as well as HMC for discovering posterior pseudotimes. To help the Ouija model, we provide it with prior information about the strength of switches for up- and down-regulated genes. 
By setting switch strength to -10 for down-regulated genes and 10 for up-regulated genes with a prior strength standard deviation of 0.5 we are telling the model that we are confident about the expected behaviour of these genes across the differentiation process. options(mc.cores = parallel::detectCores()) response_type <- c(rep("switch", length(ouija_markers_down) + length(ouija_markers_up)), rep("transient", length(ouija_markers_transient))) switch_strengths <- c(rep(-10, length(ouija_markers_down)), rep(10, length(ouija_markers_up))) switch_strength_sd <- c(rep(0.5, length(ouija_markers_down)), rep(0.5, length(ouija_markers_up))) garbage <- capture.output( oui_vb <- ouija(deng_SCE[ouija_markers,], single_cell_experiment_assay = "logcounts", response_type = response_type, switch_strengths = switch_strengths, switch_strength_sd = switch_strength_sd, inference_type = "vb") ) print(oui_vb) ## A Ouija fit with 268 cells and 20 marker genes ## Inference type: Variational Bayes ## (Gene behaviour) Switch/transient: 16 / 4 We can plot the gene expression over pseudotime along with the maximum a posteriori (MAP) estimates of the mean function (the sigmoid or Gaussian transient function) using the plot_expression function. plot_expression(oui_vb) We can also visualise when in the trajectory gene regulation behaviour occurs, either in the form of the switch time or the peak time (for switch-like or transient genes) using the plot_switch_times and plot_transient_times functions: plot_switch_times(oui_vb) plot_peak_times(oui_vb) Identify metastable states using consistency matrices. 
cmo <- consistency_matrix(oui_vb) plot_consistency(oui_vb) cell_classifications <- cluster_consistency(cmo) map_pst <- map_pseudotime(oui_vb) ouija_pseudotime <- data.frame(map_pst, cell_classifications) ggplot(ouija_pseudotime, aes(x = map_pst, y = cell_classifications)) + geom_point() + xlab("MAP pseudotime") + ylab("Cell classification") deng_SCE$pseudotime_ouija <- ouija_pseudotime$map_pst deng_SCE$ouija_cell_class <- ouija_pseudotime$cell_classifications ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_ouija, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab("Ouija pseudotime") + ylab("Timepoint") + theme_classic() Ouija does quite well in the ordering of the cells here, although it can be sensitive to the choice of marker genes and prior information supplied. How do the results change if you select different marker genes or change the priors? Ouija identifies four metastable states here, which we might annotate as “zygote/2cellâ€, “4/8/16 cellâ€, “blast1†and “blast2â€. ggplot(as.data.frame(colData(deng_SCE)), aes(x = as.factor(ouija_cell_class), y = pseudotime_ouija, colour = cell_type2)) + geom_boxplot() + coord_flip() + scale_color_manual(values = my_color) + theme_classic() + xlab("Ouija cell classification") + ylab("Ouija pseudotime") + theme_classic() A common analysis is to work out the regulation orderings of genes. For example, is gene A upregulated before gene B? Does gene C peak before the downregulation of gene D? Ouija answers these questions in terms of a Bayesian hypothesis test of whether the difference in regulation timing (either switch time or peak time) is significantly different to 0. This is collated using the gene_regulation function. 
gene_regs <- gene_regulation(oui_vb) head(gene_regs) ## # A tibble: 6 x 7 ## # Groups: label, gene_A [6] ## label gene_A gene_B mean_difference lower_95 upper_95 significant ## <chr> <chr> <chr> <dbl> <dbl> <dbl> <lgl> ## 1 Bmp15 - Cdx2 Bmp15 Cdx2 -0.0631 -0.109 -0.0133 TRUE ## 2 Bmp15 - Creb3 Bmp15 Creb3 0.269 0.201 0.321 TRUE ## 3 Bmp15 - Elf5 Bmp15 Elf5 -0.678 -0.718 -0.644 TRUE ## 4 Bmp15 - Eomes Bmp15 Eomes 0.0822 0.00272 0.156 TRUE ## 5 Bmp15 - Foxa1 Bmp15 Foxa1 -0.0211 -0.0508 0.0120 FALSE ## 6 Bmp15 - Gdf3 Bmp15 Gdf3 0.0644 0.0163 0.126 TRUE What conclusions can you draw from the gene regulation output from Ouija? If you have time, you might try the HMC inference method and see if that changes the Ouija results in any way. 11.1.9 Comparison of the methods How do the trajectories inferred by TSCAN, Monocle, Diffusion Map, SLICER and Ouija compare? TSCAN and Diffusion Map methods get the trajectory the “wrong way roundâ€, so we’ll adjust that for these comparisons. df_pseudotime <- as.data.frame( colData(deng_SCE)[, grep("pseudotime", colnames(colData(deng_SCE)))] ) colnames(df_pseudotime) <- gsub("pseudotime_", "", colnames(df_pseudotime)) df_pseudotime$PC1 <- reducedDim(deng_SCE,"PCA")[,1] df_pseudotime$order_tscan <- -df_pseudotime$order_tscan #df_pseudotime$diffusionmap <- df_pseudotime$diffusionmap df_pseudotime$slingshot1 <- colData(deng_SCE)$slingPseudotime_1 corrplot.mixed(cor(df_pseudotime, use = "na.or.complete"), order = "hclust", tl.col = "black", main = "Correlation matrix for pseudotime results", mar = c(0, 0, 3.1, 0)) We see here that Ouija, TSCAN and SLICER all give trajectories that are similar and strongly correlated with PC1. Diffusion Map is less strongly correlated with these methods, and Monocle gives very different results. 11.1.10 Expression of genes through time Each package also enables the visualization of expression through pseudotime. 
Following individual genes is very helpful for identifying genes that play an important role in the differentiation process. We illustrate the procedure using the Nanog gene. We have added the pseudotime values computed with all methods here to the colData slot of an SCE object. Having done that, the full plotting capabilities of the scater package can be used to investigate relationships between gene expression, cell populations and pseudotime. This is particularly useful for the packages such as SLICER that do not provide plotting functions. Principal components deng_SCE$PC1 <- reducedDim(deng_SCE,"PCA")[,1] plotExpression(deng_SCE, "Nanog", x = "PC1", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) TSCAN plotExpression(deng_SCE, "Nanog", x = "pseudotime_order_tscan", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) Monocle plotExpression(deng_SCE, "Nanog", x = "pseudotime_monocle2", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) Diffusion Map plotExpression(deng_SCE, "Nanog", x = "pseudotime_diffusionmap", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) SLICER plotExpression(deng_SCE, "Nanog", x = "pseudotime_slicer", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) Ouija plotExpression(deng_SCE, "Nanog", x = "pseudotime_ouija", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) How many of these methods outperform the naive approach of using the first principal component to represent pseudotime for these data? Exercise 7: Repeat the exercise using a subset of the genes, e.g. 
the set of highly variable genes that can be obtained using Brennecke_getVariableGenes() 11.1.11 dynverse https://dynverse.org/users/2-quick_start/ library(dyno) library(tidyverse) # Reproduces the guidelines as created in the shiny app answers <- dynguidelines::answer_questions( multiple_disconnected = FALSE, expect_topology = TRUE, expected_topology = "linear", n_cells = 3000, n_features = 10000, memory = "100GB", docker = FALSE ) guidelines <- dynguidelines::guidelines(answers = answers) guidelines deng_dataset <- wrap_expression( counts = counts(deng_SCE), expression = assay(deng_SCE,"logcounts") ) model <- infer_trajectory(deng_dataset, first(guidelines$methods_selected)) ## Loading required namespace: hdf5r model <- model %>% add_dimred(dyndimred::dimred_mds, expression_source = deng_dataset$expression) plot_dimred( model, expression_source = deng_dataset$expression, grouping = deng_SCE$cell_type2 ) 11.1.12 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] splines parallel stats4 stats graphics grDevices utils ## [8] datasets methods base ## ## other attached packages: ## [1] rstan_2.19.2 StanHeaders_2.19.0 ## [3] lle_1.1 snowfall_1.84-6.1 ## [5] snow_0.4-3 MASS_7.3-51.1 ## [7] scatterplot3d_0.3-41 monocle3_0.2.0 ## [9] gam_1.16.1 foreach_1.4.7 ## [11] ouija_0.99.0 Rcpp_1.0.2 ## [13] SLICER_0.2.0 slingshot_1.2.0 ## [15] princurve_2.1.4 Polychrome_1.2.3 ## [17] corrplot_0.84 ggbeeswarm_0.6.0 ## [19] ggthemes_4.2.0 
scater_1.12.2 ## [21] destiny_2.14.0 monocle_2.12.0 ## [23] DDRTree_0.1.5 irlba_2.3.3 ## [25] VGAM_1.1-1 ggplot2_3.2.1 ## [27] Matrix_1.2-17 M3Drop_1.10.0 ## [29] numDeriv_2016.8-1.1 TSCAN_1.22.0 ## [31] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [33] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [35] matrixStats_0.55.0 Biobase_2.44.0 ## [37] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 ## [39] IRanges_2.18.3 S4Vectors_0.22.1 ## [41] BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] rgl_0.100.30 rsvd_1.0.2 ## [3] vcd_1.4-4 Hmisc_4.2-0 ## [5] zinbwave_1.6.0 corpcor_1.6.9 ## [7] ps_1.3.0 class_7.3-15 ## [9] lmtest_0.9-37 glmnet_2.0-18 ## [11] crayon_1.3.4 laeken_0.5.0 ## [13] nlme_3.1-139 backports_1.1.4 ## [15] qlcMatrix_0.9.7 rlang_0.4.0 ## [17] XVector_0.24.0 readxl_1.3.1 ## [19] callr_3.3.2 limma_3.40.6 ## [21] phylobase_0.8.6 smoother_1.1 ## [23] manipulateWidget_0.10.0 bit64_0.9-7 ## [25] loo_2.1.0 glue_1.3.1 ## [27] pheatmap_1.0.12 rngtools_1.4 ## [29] splancs_2.01-40 processx_3.4.1 ## [31] vipor_0.4.5 AnnotationDbi_1.46.1 ## [33] haven_2.1.1 tidyselect_0.2.5 ## [35] rio_0.5.16 XML_3.98-1.20 ## [37] tidyr_1.0.0 zoo_1.8-6 ## [39] xtable_1.8-4 magrittr_1.5 ## [41] evaluate_0.14 bibtex_0.4.2 ## [43] cli_1.1.0 zlibbioc_1.30.0 ## [45] rstudioapi_0.10 miniUI_0.1.1.1 ## [47] sp_1.3-1 rpart_4.1-15 ## [49] locfdr_1.1-8 RcppEigen_0.3.3.5.0 ## [51] shiny_1.3.2 BiocSingular_1.0.0 ## [53] xfun_0.9 leidenbase_0.1.0 ## [55] inline_0.3.15 pkgbuild_1.0.5 ## [57] cluster_2.1.0 caTools_1.17.1.2 ## [59] sgeostat_1.0-27 tibble_2.1.3 ## [61] ggrepel_0.8.1 ape_5.3 ## [63] stabledist_0.7-1 zeallot_0.1.0 ## [65] withr_2.1.2 bitops_1.0-6 ## [67] slam_0.1-45 ranger_0.11.2 ## [69] plyr_1.8.4 cellranger_1.1.0 ## [71] pcaPP_1.9-73 sparsesvd_0.2 ## [73] coda_0.19-3 e1071_1.7-2 ## [75] RcppParallel_4.4.3 pillar_1.4.2 ## [77] gplots_3.0.1.1 reldist_1.6-6 ## [79] kernlab_0.9-27 TTR_0.23-5 ## [81] ellipsis_0.3.0 tripack_1.3-8 ## [83] DelayedMatrixStats_1.6.1 
xts_0.11-2 ## [85] vctrs_0.2.0 NMF_0.21.0 ## [87] tools_3.6.0 foreign_0.8-70 ## [89] rncl_0.8.3 beeswarm_0.2.3 ## [91] munsell_0.5.0 proxy_0.4-23 ## [93] HSMMSingleCell_1.4.0 compiler_3.6.0 ## [95] abind_1.4-5 httpuv_1.5.2 ## [97] pkgmaker_0.27 GenomeInfoDbData_1.2.1 ## [99] gridExtra_2.3 edgeR_3.26.8 ## [101] lattice_0.20-38 deldir_0.1-23 ## [103] utf8_1.1.4 later_0.8.0 ## [105] dplyr_0.8.3 jsonlite_1.6 ## [107] scales_1.0.0 docopt_0.6.1 ## [109] carData_3.0-2 genefilter_1.66.0 ## [111] lazyeval_0.2.2 promises_1.0.1 ## [113] spatstat_1.61-0 car_3.0-3 ## [115] doParallel_1.0.15 latticeExtra_0.6-28 ## [117] R.utils_2.9.0 goftest_1.1-1 ## [119] spatstat.utils_1.13-0 checkmate_1.9.4 ## [121] cowplot_1.0.0 rmarkdown_1.15 ## [123] openxlsx_4.1.0.1 statmod_1.4.32 ## [125] webshot_0.5.1 Rtsne_0.15 ## [127] forcats_0.4.0 copula_0.999-19.1 ## [129] softImpute_1.4 uwot_0.1.4 ## [131] igraph_1.2.4.1 HDF5Array_1.12.2 ## [133] survival_2.43-3 yaml_2.2.0 ## [135] htmltools_0.3.6 memoise_1.1.0 ## [137] locfit_1.5-9.1 viridisLite_0.3.0 ## [139] digest_0.6.21 assertthat_0.2.1 ## [141] mime_0.7 densityClust_0.3 ## [143] registry_0.5-1 RSQLite_2.1.2 ## [145] data.table_1.12.2 blob_1.2.0 ## [147] R.oo_1.22.0 RNeXML_2.3.0 ## [149] labeling_0.3 fastICA_1.2-2 ## [151] Formula_1.2-3 Rhdf5lib_1.6.1 ## [153] RCurl_1.95-4.12 hms_0.5.1 ## [155] rhdf5_2.28.0 colorspace_1.4-1 ## [157] base64enc_0.1-3 nnet_7.3-12 ## [159] ADGofTest_0.3 mclust_5.4.5 ## [161] bookdown_0.13 RANN_2.6.1 ## [163] mvtnorm_1.0-11 fansi_0.4.0 ## [165] pspline_1.0-18 VIM_4.8.0 ## [167] R6_2.4.0 grid_3.6.0 ## [169] lifecycle_0.1.0 acepack_1.4.1 ## [171] zip_2.0.4 curl_4.2 ## [173] gdata_2.18.0 robustbase_0.93-5 ## [175] howmany_0.3-1 RcppAnnoy_0.0.13 ## [177] RColorBrewer_1.1-2 MCMCglmm_2.29 ## [179] iterators_1.0.12 alphahull_2.2 ## [181] stringr_1.4.0 htmlwidgets_1.3 ## [183] polyclip_1.10-0 purrr_0.3.2 ## [185] crosstalk_1.0.0 mgcv_1.8-28 ## [187] tensorA_0.36.1 htmlTable_1.13.2 ## [189] clusterExperiment_2.4.4 
codetools_0.2-16 ## [191] FNN_1.1.3 gtools_3.8.1 ## [193] prettyunits_1.0.2 gridBase_0.4-7 ## [195] RSpectra_0.15-0 R.methodsS3_1.7.1 ## [197] gtable_0.3.0 DBI_1.0.0 ## [199] highr_0.8 tensor_1.5 ## [201] httr_1.4.1 KernSmooth_2.23-15 ## [203] stringi_1.4.3 progress_1.2.2 ## [205] reshape2_1.4.3 uuid_0.1-2 ## [207] cubature_2.0.3 annotate_1.62.0 ## [209] viridis_0.5.1 xml2_1.2.2 ## [211] combinat_0.0-8 bbmle_1.0.20 ## [213] boot_1.3-20 BiocNeighbors_1.2.0 ## [215] ade4_1.7-13 DEoptimR_1.0-8 ## [217] bit_1.1-14 spatstat.data_1.4-0 ## [219] pkgconfig_2.0.3 gsl_2.1-6 ## [221] knitr_1.25 References "], +["dechapter.html", "12 Differential Expression (DE) analysis 12.1 Introduction to DE analysis 12.2 DE in a real dataset", " 12 Differential Expression (DE) analysis 12.1 Introduction to DE analysis 12.1.1 Bulk RNA-seq One of the most common types of analyses when working with bulk RNA-seq data is to identify differentially expressed genes. By comparing the genes that change between two conditions, e.g. mutant and wild-type or stimulated and unstimulated, it is possible to characterize the molecular mechanisms underlying the change. Several different methods, e.g. DESeq2 and edgeR, have been developed for bulk RNA-seq. Moreover, there are also extensive datasets available where the RNA-seq data has been validated using RT-qPCR. These data can be used to benchmark DE finding algorithms and the available evidence suggests that the algorithms are performing quite well. 12.1.2 Single cell RNA-seq In contrast to bulk RNA-seq, in scRNA-seq we usually do not have a defined set of experimental conditions. Instead, as was shown in a previous chapter (10.2) we can identify the cell groups by using an unsupervised clustering approach. 
Once the groups have been identified one can find differentially expressed genes either by comparing the differences in variance between the groups (like the Kruskal-Wallis test implemented in SC3), or by comparing gene expression between clusters in a pairwise manner. In the following chapter we will mainly consider tools developed for pairwise comparisons. 12.1.3 Differences in Distribution Unlike bulk RNA-seq, we generally have a large number of samples (i.e. cells) for each group we are comparing in single-cell experiments. Thus we can take advantage of the whole distribution of expression values in each group to identify differences between groups rather than only comparing estimates of mean-expression as is standard for bulk RNASeq. There are two main approaches to comparing distributions. Firstly, we can use existing statistical models/distributions and fit the same type of model to the expression in each group then test for differences in the parameters for each model, or test whether the model fits better if a particular parameter is allowed to be different according to group. For instance in Chapter ?? we used edgeR to test whether allowing mean expression to be different in different batches significantly improved the fit of a negative binomial model of the data. Alternatively, we can use a non-parametric test which does not assume that expression values follow any particular distribution, e.g. the Kolmogorov-Smirnov test (KS-test). Non-parametric tests generally convert observed expression values to ranks and test whether the distribution of ranks for one group is significantly different from the distribution of ranks for the other group. However, some non-parametric methods fail in the presence of a large number of tied values, such as the case for dropouts (zeros) in single-cell RNA-seq expression data. Moreover, if the conditions for a parametric test hold, then it will typically be more powerful than a non-parametric test. 
12.1.4 Models of single-cell RNASeq data The most common model of RNASeq data is the negative binomial model: set.seed(1) hist( rnbinom( 1000, mu = 10, size = 100), col = "grey50", xlab = "Read Counts", main = "Negative Binomial" ) Figure 12.1: Negative Binomial distribution of read counts for a single gene across 1000 cells Mean: \\(\\mu = mu\\) Variance: \\(\\sigma^2 = mu + mu^2/size\\) It is parameterized by the mean expression (mu) and the dispersion (size), which is inversely related to the variance. The negative binomial model fits bulk RNA-seq data very well and it is used for most statistical methods designed for such data. In addition, it has been show to fit the distribution of molecule counts obtained from data tagged by unique molecular identifiers (UMIs) quite well (Grun et al. 2014, Islam et al. 2011). However, a raw negative binomial model does not fit full-length transcript data as well due to the high dropout rates relative to the non-zero read counts. For this type of data a variety of zero-inflated negative binomial models have been proposed (e.g. MAST, SCDE). d <- 0.5; counts <- rnbinom( 1000, mu = 10, size = 100 ) counts[runif(1000) < d] <- 0 hist( counts, col = "grey50", xlab = "Read Counts", main = "Zero-inflated NB" ) Figure 12.2: Zero-inflated Negative Binomial distribution Mean: \\(\\mu = mu \\cdot (1 - d)\\) Variance: \\(\\sigma^2 = \\mu \\cdot (1-d) \\cdot (1 + d \\cdot \\mu + \\mu / size)\\) These models introduce a new parameter \\(d\\), for the dropout rate, to the negative binomial model. As we saw in Chapter 19, the dropout rate of a gene is strongly correlated with the mean expression of the gene. Different zero-inflated negative binomial models use different relationships between mu and d and some may fit \\(\\mu\\) and \\(d\\) to the expression of each gene independently. Finally, several methods use a Poisson-Beta distribution which is based on a mechanistic model of transcriptional bursting. 
There is strong experimental support for this model (Kim and Marioni, 2013) and it provides a good fit to scRNA-seq data but it is less easy to use than the negative-binomial models and much less existing methods upon which to build than the negative binomial model. a <- 0.1 b <- 0.1 g <- 100 lambdas <- rbeta(1000, a, b) counts <- sapply(g*lambdas, function(l) {rpois(1, lambda = l)}) hist( counts, col = "grey50", xlab = "Read Counts", main = "Poisson-Beta" ) Mean: \\(\\mu = g \\cdot a / (a + b)\\) Variance: \\(\\sigma^2 = g^2 \\cdot a \\cdot b/((a + b + 1) \\cdot (a + b)^2)\\) This model uses three parameters: \\(a\\) the rate of activation of transcription; \\(b\\) the rate of inhibition of transcription; and \\(g\\) the rate of transcript production while transcription is active at the locus. Differential expression methods may test each of the parameters for differences across groups or only one (often \\(g\\)). All of these models may be further expanded to explicitly account for other sources of gene expression differences such as batch-effect or library depth depending on the particular DE algorithm. Exercise: Vary the parameters of each distribution to explore how they affect the distribution of gene expression. How similar are the Poisson-Beta and Negative Binomial models? 12.2 DE in a real dataset library(scRNA.seq.funcs) library(edgeR) library(monocle) library(MAST) library(ROCR) set.seed(1) 12.2.1 Introduction To test different single-cell differential expression methods we will be using the Blischak dataset from Chapters 7-17. For this experiment bulk RNA-seq data for each cell-line was generated in addition to single-cell data. We will use the differentially expressed genes identified using standard methods on the respective bulk data as the ground truth for evaluating the accuracy of each single-cell method. To save time we have pre-computed these for you. You can run the commands below to load these data. 
DE <- read.table("data/tung/TPs.txt") notDE <- read.table("data/tung/TNs.txt") GroundTruth <- list( DE = as.character(unlist(DE)), notDE = as.character(unlist(notDE)) ) This ground truth has been produced for the comparison of individual NA19101 to NA19239. Now load the respective single-cell data: molecules <- read.table("data/tung/molecules.txt", sep = "\\t") anno <- read.table("data/tung/annotation.txt", sep = "\\t", header = TRUE) keep <- anno[,1] == "NA19101" | anno[,1] == "NA19239" data <- molecules[,keep] group <- anno[keep,1] batch <- anno[keep,4] # remove genes that aren't expressed in at least 6 cells gkeep <- rowSums(data > 0) > 5; counts <- data[gkeep,] # Library size normalization lib_size = colSums(counts) norm <- t(t(counts)/lib_size * median(lib_size)) # Variant of CPM for datasets with library sizes of fewer than 1 mil molecules Now we will compare various single-cell DE methods. We will focus on methods that performed well in Soneson and Robinson's [2019; CITE] detailed comparison of differential expression methods for single-cell data. Note that we will only be running methods which are available as R-packages and run relatively quickly. 12.2.2 Kolmogorov-Smirnov test The types of test that are easiest to work with are non-parametric ones. The most commonly used non-parametric test is the Kolmogorov-Smirnov test (KS-test) and we can use it to compare the distributions for each gene in the two individuals. The KS-test quantifies the distance between the empirical cumulative distributions of the expression of each gene in each of the two populations. It is sensitive to changes in mean expression and changes in variability. However it assumes data is continuous and may perform poorly when data contains a large number of identical values (e.g. zeros). Another issue with the KS-test is that it can be very sensitive for large sample sizes and thus it may end up as significant even though the magnitude of the difference is very small. 
Now run the test: pVals <- apply( norm, 1, function(x) { ks.test( x[group == "NA19101"], x[group == "NA19239"] )$p.value } ) # multiple testing correction pVals <- p.adjust(pVals, method = "fdr") This code “applies†the function to each row (specified by 1) of the expression matrix, data. In the function we are returning just the p.value from the ks.test output. We can now consider how many of the ground truth positive and negative DE genes are detected by the KS-test: 12.2.2.1 Evaluating Accuracy sigDE <- names(pVals)[pVals < 0.05] length(sigDE) # Number of KS-DE genes sum(GroundTruth$DE %in% sigDE) # Number of KS-DE genes that are true DE genes sum(GroundTruth$notDE %in% sigDE) # Number of KS-DE genes that are truly not-DE As you can see many more of our ground truth negative genes were identified as DE by the KS-test (false positives) than ground truth positive genes (true positives), however this may be due to the larger number of notDE genes thus we typically normalize these counts as the True positive rate (TPR), TP/(TP + FN), and False positive rate (FPR), FP/(FP+TP). tp <- sum(GroundTruth$DE %in% sigDE) fp <- sum(GroundTruth$notDE %in% sigDE) tn <- sum(GroundTruth$notDE %in% names(pVals)[pVals >= 0.05]) fn <- sum(GroundTruth$DE %in% names(pVals)[pVals >= 0.05]) tpr <- tp/(tp + fn) fpr <- fp/(fp + tn) cat(c(tpr, fpr)) Now we can see the TPR is much higher than the FPR indicating the KS test is identifying DE genes. So far we’ve only evaluated the performance at a single significance threshold. Often it is informative to vary the threshold and evaluate performance across a range of values. This is then plotted as a receiver-operating-characteristic curve (ROC) and a general accuracy statistic can be calculated as the area under this curve (AUC). We will use the ROCR package to facilitate this plotting. 
# Only consider genes for which we know the ground truth pVals <- pVals[names(pVals) %in% GroundTruth$DE | names(pVals) %in% GroundTruth$notDE] truth <- rep(1, times = length(pVals)); truth[names(pVals) %in% GroundTruth$DE] = 0; pred <- ROCR::prediction(pVals, truth) perf <- ROCR::performance(pred, "tpr", "fpr") ROCR::plot(perf) aucObj <- ROCR::performance(pred, "auc") aucObj@y.values[[1]] # AUC Finally to facilitate the comparisons of other DE methods let’s put this code into a function so we don’t need to repeat it: DE_Quality_AUC <- function(pVals) { pVals <- pVals[names(pVals) %in% GroundTruth$DE | names(pVals) %in% GroundTruth$notDE] truth <- rep(1, times = length(pVals)); truth[names(pVals) %in% GroundTruth$DE] = 0; pred <- ROCR::prediction(pVals, truth) perf <- ROCR::performance(pred, "tpr", "fpr") ROCR::plot(perf) aucObj <- ROCR::performance(pred, "auc") return(aucObj@y.values[[1]]) } 12.2.3 Wilcox/Mann-Whitney-U Test The Wilcox-rank-sum test is another non-parametric test, but tests specifically if values in one group are greater/less than the values in the other group. Thus it is often considered a test for difference in median expression between two groups; whereas the KS-test is sensitive to any change in distribution of expression values. pVals <- apply( norm, 1, function(x) { wilcox.test( x[group == "NA19101"], x[group == "NA19239"] )$p.value } ) # multiple testing correction pVals <- p.adjust(pVals, method = "fdr") DE_Quality_AUC(pVals) 12.2.4 edgeR We’ve already used edgeR for differential expression in Chapter ??. edgeR is based on a negative binomial model of gene expression and uses a generalized linear model (GLM) framework, the enables us to include other factors such as batch to the model. 
dge <- DGEList( counts = counts, norm.factors = rep(1, length(counts[1,])), group = group ) group_edgeR <- factor(group) design <- model.matrix(~ group_edgeR) dge <- estimateDisp(dge, design = design, trend.method = "none") fit <- glmFit(dge, design) res <- glmLRT(fit) pVals <- res$table[,4] names(pVals) <- rownames(res$table) pVals <- p.adjust(pVals, method = "fdr") DE_Quality_AUC(pVals) 12.2.5 MAST MAST is based on a zero-inflated negative binomial model. It tests for differential expression using a hurdle model to combine tests of discrete (0 vs not zero) and continuous (non-zero values) aspects of gene expression. Again this uses a linear modelling framework to enable complex models to be considered. log_counts <- log(counts + 1) / log(2) fData <- data.frame(names = rownames(log_counts)) rownames(fData) <- rownames(log_counts); cData <- data.frame(cond = group) rownames(cData) <- colnames(log_counts) obj <- FromMatrix(as.matrix(log_counts), cData, fData) colData(obj)$cngeneson <- scale(colSums(assay(obj) > 0)) cond <- factor(colData(obj)$cond) # Model expression as function of condition & number of detected genes zlmCond <- zlm.SingleCellAssay(~ cond + cngeneson, obj) summaryCond <- summary(zlmCond, doLRT = "condNA19101") summaryDt <- summaryCond$datatable summaryDt <- as.data.frame(summaryDt) pVals <- unlist(summaryDt[summaryDt$component == "H",4]) # H = hurdle model names(pVals) <- unlist(summaryDt[summaryDt$component == "H",1]) pVals <- p.adjust(pVals, method = "fdr") DE_Quality_AUC(pVals) 12.2.6 limma 12.2.7 Pseudobulk 12.2.8 sessionInfo() "], ["imputation.html", "13 Imputation", " 13 Imputation library(scImpute) library(SC3) library(scater) library(SingleCellExperiment) library(mclust) library(DrImpute) set.seed(1234567) As discussed previously, one of the main challenges when analyzing scRNA-seq data is the presence of zeros, or dropouts. 
The dropouts are assumed to have arisen for three possible reasons: The gene was not expressed in the cell and hence there are no transcripts to sequence The gene was expressed, but for some reason the transcripts were lost somewhere prior to sequencing The gene was expressed and transcripts were captured and turned into cDNA, but the sequencing depth was not sufficient to produce any reads. Thus, dropouts could be the result of experimental shortcomings, and if this is the case then we would like to provide computational corrections. One possible solution is to impute the dropouts in the expression matrix. To be able to impute gene expression values, one must have an underlying model. However, since we do not know which dropout events are technical artefacts and which correspond to the transcript being truly absent, imputation is a difficult challenge and prone to creating false-positive results in downstream analysis. There are many different imputation methods available; we will consider two fast, published methods: DrImpute and scImpute (Li and Li 2017). DrImpute and scImpute both use a model to determine which zeros are technical and impute only those values. Both use clustering to identify a group of cells that are assumed to have homogeneous expression. DrImpute imputes all values that are not consistently zero in all cells of a cluster. In contrast, scImpute uses a zero-inflated normal distribution fit to log-normalized expression values and imputes all inflated zeros. 13.0.1 scImpute To test scImpute, we use the default parameters and we apply it to the Deng dataset that we have worked with before. 
scImpute takes a .csv or .txt file as an input: deng <- readRDS("data/deng/deng-reads.rds") write.csv(counts(deng), "deng.csv") scimpute( count_path = "deng.csv", infile = "csv", outfile = "txt", out_dir = "./", Kcluster = 10, ncores = 2 ) Now we can compare the results with original data by considering a PCA plot res <- read.table("scimpute_count.txt") colnames(res) <- NULL res <- SingleCellExperiment( assays = list(logcounts = log2(as.matrix(res) + 1)), colData = colData(deng) ) rowData(res)$feature_symbol <- rowData(deng)$feature_symbol plotPCA( res, colour_by = "cell_type2" ) Compare this result to the original data in Chapter 10.2. What are the most significant differences? We can examine the expression of specific genes to directly see the effect of imputation on the expression distribution. plotExpression(res, c("Sox2", "Eomes", "Zscan4d", "Fgf4")) plotExpression(deng, c("Sox2", "Eomes", "Zscan4d", "Fgf4")) To evaluate the impact of the imputation, we use SC3 to cluster the imputed matrix res <- sc3_estimate_k(res) metadata(res)$sc3$k_estimation res <- sc3(res, ks = 10, n_cores = 1, gene_filter = FALSE) adjustedRandIndex(colData(deng)$cell_type2, colData(res)$sc3_10_clusters) plotPCA( res, colour_by = "sc3_10_clusters" ) Exercise: Based on the PCA and the clustering results, do you think that imputation using scImpute is a good idea for the Deng dataset? 13.0.2 DrImpute We can do the same for DrImpute. DrImpute runs on a log-normalized expression matrix directly in R, we generate this matrix using scater, then run DrImpute. 
Unlike scImpute, DrImpute considers the consensus imputation across a range of ks using two different correlation distances: deng <- normalize(deng) res <- DrImpute(deng@assays[["logcounts"]], ks=8:12) colnames(res) <- colnames(deng) rownames(res) <- rownames(deng) res <- SingleCellExperiment( assays = list(logcounts = as.matrix(res)), colData = colData(deng) ) rowData(res)$feature_symbol <- rowData(deng)$feature_symbol plotPCA( res, colour_by = "cell_type2" ) plotExpression(res, c("Sox2", "Eomes", "Zscan4d", "Fgf4")) Exercise: Check the sc3 clustering of the DrImpute matrix, do you think that imputation using DrImpute is a good idea for the Deng dataset? Exercise: What is the difference between scImpute and DrImpute based on the PCA and clustering analysis? Which one do you think is best to use? 13.0.3 sessionInfo() References "], ["comparing-and-combining-scrna-seq-datasets.html", "14 Comparing and combining scRNA-seq datasets 14.1 Search scRNA-Seq data", " 14 Comparing and combining scRNA-seq datasets library(scater) library(SingleCellExperiment) 14.0.1 Introduction As more and more scRNA-seq datasets become available, carrying out comparisons between them is key. There are two main approaches to comparing scRNASeq datasets. The first approach is "label-centric", which is focused on trying to identify equivalent cell-types/states across datasets by comparing individual cells or groups of cells. The other approach is "cross-dataset normalization", which attempts to computationally remove experiment-specific technical/biological effects so that data from multiple experiments can be combined and jointly analyzed. The label-centric approach can be used with datasets with high-confidence cell-annotations, e.g. the Human Cell Atlas (HCA) (Regev et al. 2017) or the Tabula Muris (???) once they are completed, to project cells or clusters from a new sample onto this reference to consider tissue composition and/or identify cells with novel/unknown identity. 
Conceptually, such projections are similar to the popular BLAST method (Altschul et al. 1990), which makes it possible to quickly find the closest match in a database for a newly identified nucleotide or amino acid sequence. The label-centric approach can also be used to compare datasets of similar biological origin collected by different labs to ensure that the annotation and the analysis is consistent. The cross-dataset normalization approach can also be used to compare datasets of similar biological origin; unlike the label-centric approach, it enables the joint analysis of multiple datasets to facilitate the identification of rare cell-types which may be too sparsely sampled in each individual dataset to be reliably detected. However, cross-dataset normalization is not applicable to very large and diverse references since it assumes a significant portion of the biological variability in each of the datasets overlaps with others. 14.0.2 Datasets We will be running these methods on two human pancreas datasets: (Muraro et al. 2016) and (Segerstolpe et al. 2016). Since the pancreas has been widely studied, these datasets are well annotated. muraro <- readRDS("data/pancreas/muraro.rds") segerstolpe <- readRDS("data/pancreas/segerstolpe.rds") This data has already been formatted for scmap. Cell type labels must be stored in the cell_type1 column of the colData slots, and gene ids that are consistent across both datasets must be stored in the feature_symbol column of the rowData slots. First, let's check our gene-ids match across both datasets: sum(rowData(muraro)$feature_symbol %in% rowData(segerstolpe)$feature_symbol)/nrow(muraro) sum(rowData(segerstolpe)$feature_symbol %in% rowData(muraro)$feature_symbol)/nrow(segerstolpe) Here we can see that 96% of the genes present in muraro match genes in segerstolpe and 72% of genes in segerstolpe match genes in muraro. This is as expected because the segerstolpe dataset was more deeply sequenced than the muraro dataset. 
However, it highlights some of the difficulties in comparing scRNASeq datasets. We can confirm this by checking the overall size of these two datasets. dim(muraro) dim(segerstolpe) In addition, we can check the cell-type annotations for each of these dataset using the command below: summary(factor(colData(muraro)$cell_type1)) summary(factor(colData(segerstolpe)$cell_type1)) Here we can see that even though both datasets consider the same biological tissue, they have been annotated with slightly different sets of cell-types. If you are familiar with pancreas biology you might recognize that the pancreatic stellate cells (PSCs) in segerstolpe are a type of mesenchymal stem cell which would fall under the "mesenchymal" type in muraro. However, it isn't clear whether these two annotations should be considered synonymous or not. We can use label-centric comparison methods to determine if these two cell-type annotations are indeed equivalent. Alternatively, we might be interested in understanding the function of those cells that were "unclassified endocrine" or were deemed too poor quality ("not applicable") for the original clustering in each dataset by leveraging information across datasets. Either we could attempt to infer which of the existing annotations they most likely belong to using label-centric approaches or we could try to uncover a novel cell-type among them (or a sub-type within the existing annotations) using cross-dataset normalization. To simplify our demonstration analyses we will remove the small classes of unassigned cells, and the poor quality cells. We will retain the "unclassified endocrine" to see if any of these methods can elucidate what cell-type they belong to. 
segerstolpe <- segerstolpe[,colData(segerstolpe)$cell_type1 != "unclassified"] segerstolpe <- segerstolpe[,colData(segerstolpe)$cell_type1 != "not applicable",] muraro <- muraro[,colData(muraro)$cell_type1 != "unclear"] 14.0.3 Projecting cells onto annotated cell-types (scmap) library(scmap) set.seed(1234567) We recently developed scmap (Kiselev and Hemberg 2017) - a method for projecting cells from a scRNA-seq experiment onto the cell-types identified in other experiments. Additionally, a cloud version of scmap can be run for free, without restrictions, from http://www.hemberg-lab.cloud/scmap. 14.0.3.1 Feature Selection Once we have a SingleCellExperiment object we can run scmap. First we have to build the "index" of our reference clusters. Since we want to know whether PSCs and mesenchymal cells are synonymous we will project each dataset to the other so we will build an index for each dataset. This requires first selecting the most informative features for the reference dataset. muraro <- selectFeatures(muraro, suppress_plot = FALSE) Genes highlighted with the red colour will be used in the further analysis (projection). segerstolpe <- selectFeatures(segerstolpe, suppress_plot = FALSE) From the y-axis of these plots we can see that scmap uses a dropout-based feature selection method. Now calculate the cell-type index: muraro <- indexCluster(muraro) segerstolpe <- indexCluster(segerstolpe) We can also visualize the index: heatmap(as.matrix(metadata(muraro)$scmap_cluster_index)) You may want to adjust your features using the setFeatures function if features are too heavily concentrated in only a few cell-types. In this case the dropout-based features look good so we will just use them. Exercise Using the rowData of each dataset how many genes were selected as features in both datasets? What does this tell you about these datasets? 
Answer 14.0.3.2 Projecting scmap computes the distance from each cell to each cell-type in the reference index, then applies an empirically derived threshold to determine which cells are assigned to the closest reference cell-type and which are unassigned. To account for differences in sequencing depth distance is calculated using the spearman correlation and cosine distance and only cells with a consistent assignment with both distances are returned as assigned. We will project the segerstolpe dataset to muraro dataset: seger_to_muraro <- scmapCluster( projection = segerstolpe, index_list = list( muraro = metadata(muraro)$scmap_cluster_index ) ) and muraro onto segerstolpe muraro_to_seger <- scmapCluster( projection = muraro, index_list = list( seger = metadata(segerstolpe)$scmap_cluster_index ) ) Note that in each case we are projecting to a single dataset but that this could be extended to any number of datasets for which we have computed indices. Now lets compare the original cell-type labels with the projected labels: table(colData(muraro)$cell_type1, muraro_to_seger$scmap_cluster_labs) Here we can see that cell-types do map to their equivalents in segerstolpe, and importantly we see that all but one of the "mesenchymal" cells were assigned to the "PSC" class. table(colData(segerstolpe)$cell_type1, seger_to_muraro$scmap_cluster_labs) Again we see cell-types match each other and that all but one of the "PSCs" match the "mesenchymal" cells providing strong evidence that these two annotations should be considered synonymous. We can also visualize these tables using a Sankey diagram: plot(getSankey(colData(muraro)$cell_type1, muraro_to_seger$scmap_cluster_labs[,1], plot_height=400)) Exercise How many of the previously unclassified cells would we be able to assign to cell-types using scmap? Answer 14.0.4 Cell-to-Cell mapping scmap can also project each cell in one dataset to its approximate closest neighbouring cell in the reference dataset. 
This uses a highly optimized search algorithm allowing it to be scaled to very large references (in theory 100,000-millions of cells). However, this process is stochastic so we must fix the random seed to ensure we can reproduce our results. We have already performed feature selection for this dataset so we can go straight to building the index. set.seed(193047) segerstolpe <- indexCell(segerstolpe) muraro <- indexCell(muraro) In this case the index is a series of clusterings of each cell using different sets of features, parameters k and M are the number of clusters and the number of features used in each of these subclusterings. New cells are assigned to the nearest cluster in each subclustering to generate unique pattern of cluster assignments. We then find the cell in the reference dataset with the same or most similar pattern of cluster assignments. We can examine the cluster assignment patterns for the reference datasets using: metadata(muraro)$scmap_cell_index$subclusters[1:5,1:5] To project and find the w nearest neighbours we use a similar command as before: muraro_to_seger <- scmapCell( projection = muraro, index_list = list( seger = metadata(segerstolpe)$scmap_cell_index ), w = 5 ) We can again look at the results: muraro_to_seger$seger[[1]][,1:5] This shows the column number of the 5 nearest neighbours in segerstolpe to each of the cells in muraro. We could then calculate a pseudotime estimate, branch assignment, or other cell-level data by selecting the appropriate data from the colData of the segerstolpe data set. As a demonstration we will find the cell-type of the nearest neighbour of each cell. cell_type_NN <- colData(segerstolpe)$cell_type1[muraro_to_seger$seger[[1]][1,]] head(cell_type_NN) 14.0.5 Metaneighbour Metaneighbour is specifically designed to ask whether cell-type labels are consistent across datasets. It comes in two versions. 
First is a fully supervised method which assumes cell-types are known in all datasets and calculates how "good" those cell-type labels are. (The precise meaning of "good" will be described below). Alternatively, metaneighbour can estimate how similar all cell-types are to each other both within and across datasets. We will only be using the unsupervised version as it has much more general applicability and is easier to interpret the results of. Metaneighbour compares cell-types across datasets by building a cell-cell spearman correlation network. The method then tries to predict the label of each cell through weighted "votes" of its nearest-neighbours. Then scores the overall similarity between two clusters as the AUROC for assigning cells of typeA to typeB based on these weighted votes. AUROC of 1 would indicate all the cells of typeA were assigned to typeB before any other cells were, and an AUROC of 0.5 is what you would get if cells were being randomly assigned. Metaneighbour is just a couple of R functions not a complete package so we have to load them using source source("course_files/utils/2017-08-28-runMN-US.R") 14.0.5.1 Prepare Data Metaneighbour requires all datasets to be combined into a single expression matrix prior to running: is.common <- rowData(muraro)$feature_symbol %in% rowData(segerstolpe)$feature_symbol muraro <- muraro[is.common,] segerstolpe <- segerstolpe[match(rowData(muraro)$feature_symbol, rowData(segerstolpe)$feature_symbol),] rownames(segerstolpe) <- rowData(segerstolpe)$feature_symbol rownames(muraro) <- rowData(muraro)$feature_symbol identical(rownames(segerstolpe), rownames(muraro)) combined_logcounts <- cbind(logcounts(muraro), logcounts(segerstolpe)) dataset_labels <- rep(c("m", "s"), times=c(ncol(muraro), ncol(segerstolpe))) cell_type_labels <- c(colData(muraro)$cell_type1, colData(segerstolpe)$cell_type1) pheno <- data.frame(Sample_ID = colnames(combined_logcounts), Study_ID=dataset_labels, Celltype=paste(cell_type_labels, 
dataset_labels, sep="-")) rownames(pheno) <- colnames(combined_logcounts) Metaneighbor includes a feature selection method to identify highly variable genes. var.genes = get_variable_genes(combined_logcounts, pheno) Since Metaneighbor is much slower than scmap, we will down sample these datasets. subset <- sample(1:nrow(pheno), 2000) combined_logcounts <- combined_logcounts[,subset] pheno <- pheno[subset,] cell_type_labels <- cell_type_labels[subset] dataset_labels <- dataset_labels[subset] Now we are ready to run Metaneighbor. First we will run the unsupervised version that will let us see which cell-types are most similar across the two datasets. unsup <- run_MetaNeighbor_US(var.genes, combined_logcounts, unique(pheno$Celltype), pheno) heatmap(unsup) 14.0.6 mnnCorrect mnnCorrect corrects datasets to facilitate joint analysis. In order to account for differences in composition between two replicates or two different experiments it first matches individual cells across experiments to find the overlapping biological structure. Using that overlap it learns which dimensions of expression correspond to the biological state and which dimensions correspond to batch/experiment effect; mnnCorrect assumes these dimensions are orthogonal to each other in high dimensional expression space. Finally it removes the batch/experiment effects from the entire expression matrix to return the corrected matrix. To match individual cells to each other across datasets, mnnCorrect uses the cosine distance to avoid library-size effect then identifies mutual nearest neighbours (k determines the neighbourhood size) across datasets. Only overlapping biological groups should have mutual nearest neighbours (see panel b below). However, this assumes that k is set to approximately the size of the smallest biological group in the datasets, but a k that is too low will identify too few mutual nearest-neighbour pairs to get a good estimate of the batch effect we want to remove. 
Learning the biological/technical effects is done with either singular value decomposition, similar to RUV we encountered in the batch-correction section, or with principal component analysis with the optimized irlba package, which should be faster than SVD. The parameter svd.dim specifies how many dimensions should be kept to summarize the biological structure of the data, we will set it to three as we found three major groups using Metaneighbor above. These estimates may be further adjusted by smoothing (sigma) and/or variance adjustment (var.adj). mnnCorrect also assumes you've already subset your expression matrices so that they contain identical genes in the same order, fortunately we have already done this for our datasets when we set up our data for Metaneighbor. require("batchelor") # mnnCorrect will take several minutes to run corrected <- mnnCorrect(logcounts(muraro), logcounts(segerstolpe), k=20, sigma=1, pc.approx=TRUE, subset.row=var.genes, svd.dim=3) First let's check that we found a sufficient number of mnn pairs, mnnCorrect returns a list of dataframe with the mnn pairs for each dataset. dim(corrected$pairs[[1]]) # muraro -> others dim(corrected$pairs[[2]]) # seger -> others The first and second columns contain the cell column IDs and the third column contains a number indicating which dataset/batch the column 2 cell belongs to. In our case, we are only comparing two datasets so all the mnn pairs have been assigned to the second table and the third column contains only ones head(corrected$pairs[[2]]) total_pairs <- nrow(corrected$pairs[[2]]) n_unique_seger <- length(unique((corrected$pairs[[2]][,1]))) n_unique_muraro <- length(unique((corrected$pairs[[2]][,2]))) mnnCorrect found "r total_pairs" sets of mutual nearest-neighbours between n_unique_seger segerstolpe cells and n_unique_muraro muraro cells. 
This should be a sufficient number of pairs but the low number of unique cells in each dataset suggests we might not have captured the full biological signal in each dataset. Exercise Which cell-types had mnns across these datasets? Should we increase/decrease k? Answer Now we could create a combined dataset to jointly analyse these data. However, the corrected data is no longer counts and usually will contain negative expression values thus some analysis tools may no longer be appropriate. For simplicity let's just plot a joint TSNE. require("Rtsne") joint_expression_matrix <- cbind(corrected$corrected[[1]], corrected$corrected[[2]]) # Tsne will take some time to run on the full dataset joint_tsne <- Rtsne(t(joint_expression_matrix[rownames(joint_expression_matrix) %in% var.genes,]), initial_dims=10, theta=0.75, check_duplicates=FALSE, max_iter=200, stop_lying_iter=50, mom_switch_iter=50) dataset_labels <- factor(rep(c("m", "s"), times=c(ncol(muraro), ncol(segerstolpe)))) cell_type_labels <- factor(c(colData(muraro)$cell_type1, colData(segerstolpe)$cell_type1)) plot(joint_tsne$Y[,1], joint_tsne$Y[,2], pch=c(16,1)[dataset_labels], col=rainbow(length(levels(cell_type_labels)))[cell_type_labels]) 14.0.7 Canonical Correlation Analysis (Seurat) The Seurat package contains another correction method for combining multiple datasets, called CCA. However, unlike mnnCorrect it doesn't correct the expression matrix itself directly. Instead Seurat finds a lower dimensional subspace for each dataset then corrects these subspaces. Also different from mnnCorrect, Seurat only combines a single pair of datasets at a time. Seurat uses gene-gene correlations to identify the biological structure in the dataset with a method called canonical correlation analysis (CCA). Seurat learns the shared structure to the gene-gene correlations and then evaluates how well each cell fits this structure. 
Cells which must better described by a data-specific dimensionality reduction method than by the shared correlation structure are assumed to represent dataset-specific cell-types/states and are discarded before aligning the two datasets. Finally the two datasets are aligned using ‘warping’ algorithms which normalize the low-dimensional representations of each dataset in a way that is robust to differences in population density. Note because Seurat uses up a lot of library space you will have to restart your R-session to load it, and the plots/output won’t be automatically generated on this page. Reload the data: muraro <- readRDS("data/pancreas/muraro.rds") segerstolpe <- readRDS("data/pancreas/segerstolpe.rds") segerstolpe <- segerstolpe[,colData(segerstolpe)$cell_type1 != "unclassified"] segerstolpe <- segerstolpe[,colData(segerstolpe)$cell_type1 != "not applicable",] muraro <- muraro[,colData(muraro)$cell_type1 != "unclear"] is.common <- rowData(muraro)$feature_symbol %in% rowData(segerstolpe)$feature_symbol muraro <- muraro[is.common,] segerstolpe <- segerstolpe[match(rowData(muraro)$feature_symbol, rowData(segerstolpe)$feature_symbol),] rownames(segerstolpe) <- rowData(segerstolpe)$feature_symbol rownames(muraro) <- rowData(muraro)$feature_symbol identical(rownames(segerstolpe), rownames(muraro)) First we will reformat our data into Seurat objects: require("Seurat") set.seed(4719364) muraro_seurat <- CreateSeuratObject(raw.data=assays(muraro)[["normcounts"]]) # raw counts aren't available for muraro muraro_seurat@meta.data[, "dataset"] <- 1 muraro_seurat@meta.data[, "celltype"] <- paste("m",colData(muraro)$cell_type1, sep="-") seger_seurat <- CreateSeuratObject(raw.data=assays(segerstolpe)[["counts"]]) seger_seurat@meta.data[, "dataset"] <- 2 seger_seurat@meta.data[, "celltype"] <- paste("s",colData(segerstolpe)$cell_type1, sep="-") Next we must normalize, scale and identify highly variable genes for each dataset: muraro_seurat <- 
NormalizeData(object=muraro_seurat) muraro_seurat <- ScaleData(object=muraro_seurat) muraro_seurat <- FindVariableGenes(object=muraro_seurat, do.plot=TRUE) seger_seurat <- NormalizeData(object=seger_seurat) seger_seurat <- ScaleData(object=seger_seurat) seger_seurat <- FindVariableGenes(object=seger_seurat, do.plot=TRUE) Even though Seurat corrects for the relationship between dispersion and mean expression, it doesn't use the corrected value when ranking features. Compare the results of the command below with the results in the plots above: head(muraro_seurat@hvg.info, 50) head(seger_seurat@hvg.info, 50) But we will follow their example and use the top 2000 most dispersed genes without correcting for mean expression from each dataset anyway. gene.use <- union(rownames(x = head(x = muraro_seurat@hvg.info, n = 2000)), rownames(x = head(x = seger_seurat@hvg.info, n = 2000))) Exercise Find the features we would use if we selected the top 2000 most dispersed after scaling by mean. (Hint: consider the order function) Answer Now we will run CCA to find the shared correlation structure for these two datasets: Note to speed up the calculations we will be using only the top 5 dimensions but ideally you would consider many more and then select the top most informative ones using DimHeatmap. merged_seurat <- RunCCA(object=muraro_seurat, object2=seger_seurat, genes.use=gene.use, add.cell.id1="m", add.cell.id2="s", num.cc = 5) DimPlot(object = merged_seurat, reduction.use = "cca", group.by = "dataset", pt.size = 0.5) # Before correcting To identify dataset specific cell-types we compare how well cells are 'explained' by CCA vs dataset-specific principal component analysis. 
merged_seurat <- CalcVarExpRatio(object = merged_seurat, reduction.type = "pca", grouping.var = "dataset", dims.use = 1:5) merged.all <- merged_seurat merged_seurat <- SubsetData(object=merged_seurat, subset.name="var.ratio.pca", accept.low = 0.5) # CCA > 1/2 as good as PCA merged.discard <- SubsetData(object=merged.all, subset.name="var.ratio.pca", accept.high = 0.5) summary(factor(merged.discard@meta.data$celltype)) # check the cell-type of the discarded cells. Here we can see that despite both datasets containing endothelial cells, almost all of them have been discarded as “dataset-specificâ€. Now we can align the datasets: merged_seurat <- AlignSubspace(object = merged_seurat, reduction.type = "cca", grouping.var = "dataset", dims.align = 1:5) DimPlot(object = merged_seurat, reduction.use = "cca.aligned", group.by = "dataset", pt.size = 0.5) # After aligning subspaces Exercise Compare the results for if you use the features after scaling dispersions. Answer Advanced Exercise Use the clustering methods we previously covered on the combined datasets. Do you identify any novel cell-types? 14.0.8 sessionInfo() 14.1 Search scRNA-Seq data library(scfind) library(SingleCellExperiment) library(plotly) set.seed(1234567) 14.1.1 About scfind is a tool that allows one to search single cell RNA-Seq collections (Atlas) using lists of genes, e.g. searching for cells and cell-types where a specific set of genes are expressed. scfind is a Github package. Cloud implementation of scfind with a large collection of datasets is available on our website. 14.1.2 Dataset We will run scfind on the Tabula Muris 10X dataset. 
scfind also operates on SingleCellExperiment class: tm10x_heart <- readRDS("data/sce/Heart_10X.rds") tm10x_heart colData(tm10x_heart) 14.1.3 Gene Index Now we need to create a gene index using our dataset: heart_index <- buildCellTypeIndex( tm10x_heart, cell_type_column = "cell_type1" ) scfind adopts a two-step compression strategy which allows efficient compression of large cell-by-gene matrix and allows fast retrieval of data by gene query. We estimated that one can achieve 2 orders of magnitude compression with this method. The input matrix for indexing is the raw count matrix of the SingleCellExperiment class. By default the cell_type1 column of the colData slot of the SingleCellExperiment object is used to define cell types, however it can also be defined manually using the cell.type.label argument of the buildCellTypeIndex. For dataset with more than one tissue, you can also merge all tissues together to create a super index using the function mergeDataset. The index can be saved in .rds format using saveObject function and loaded using loadObject function for future use. tm10x_thymus <- readRDS("data/sce/Thymus_10X.rds") thymus_index <- buildCellTypeIndex( tm10x_thymus, cell_type_column = "cell_type1" ) ## scfind_index <- mergeDataset(heart_index, thymus_index) ## scfind_index@datasets ## cellTypeNames(scfind_index) ## sample(scfindGenes(scfind_index),20) To quickly and easily find the enriched cell type using an interactive Shiny application use the following method: 14.1.4 Marker genes Now let’s find the marker genes for Thymus T cell in the datasets # Showing the top 5 marker genes for each cell type and sort by F1 score. 
t_cell_markers <- cellTypeMarkers(scfind_index, cell.types = "Thymus.T cell", top.k = 5, sort.field = "f1") t_cell_markers Next, you can evaluate the markers of Thymus T cell in Thymus stromal cell evaluateMarkers( scfind_index, gene.list = as.character(t_cell_markers$genes), cell.types = "Thymus.stromal cell", sort.field = "f1" ) # By default, the marker evaluation takes all cell types in the dataset as background cell type, but you can use the argument `background.cell.types` to fine tune the evaluation background <- cellTypeNames(scfind_index, datasets = "Thymus") background evaluateMarkers( scfind_index, gene.list = as.character(t_cell_markers$genes), cell.types = "Thymus.stromal cell", sort.field = "f1", background.cell.types = background ) 14.1.5 Search cells by a gene list scfind can instantly identify the cell type that best represents the genes of interest from large single cell dataset. We will use the marker genes identified in an original publication Yanbin et al. 2015. Cardiomyocyte-specific markers used in immunostaining as shown in Figure 1. cardiomyocytes <- c("Mef2c", "Gata4", "Nkx2.5", "Myh6", "tnnt2", "tnni3", "CDH2", "Cx43", "GJA1") result <- markerGenes( scfind_index, gene.list = cardiomyocytes ) result To allow search of enriched cell type from a long list of gene query, scfind features a query optimization routine. First, the function markerGenes will counter suggest subqueries that with the highest support in the dataset. The TF-IDF score for each gene set allows user to identify the best subquery for finding the most relevant cell type. best_subquery <- result[which.max(result$tfidf),] # get the best subquery by ranking TF-IDF score best_subquery <- strsplit(as.character(best_subquery$Query), ",")[[1]] # obtain gene list hyperQueryCellTypes( scfind_index, gene.list = best_subquery ) hyperQueryCellTypes function returns a list of p-values corresponding to all cell types in a given dataset. 
It also outputs a list of cells in which genes from the given gene list are co-expressed. Exercise 1 Find the marker genes of all cell types in the Heart dataset cardiac_contractility <- c("Ace2","Fkbp1b","Gh","Cacna1c","Cd59b","Ppp1r1a","Tnnt2","Nos1","Agtr1a","Camk2g","Grk2","Ins2","Dnah8","Igf1","Nos3","Nppa","Nppb","Il6","Myh6","Ren2","Tnni3","Apln","Kcnmb1","Pik3cg","Prkca","Aplnr","Slc8a1","Ace","Akt1","Edn1","Kcnmb2","Nos2","Tnf","Myh14","Adrb2","Agt","Adrb1","Atp2a2","Ryr2","Pln") Exercise 2 Input the gene list relevant to “cardiac contractility†and find the best gene set with the highest support. Identify the enriched cell type for this query. 14.1.6 In-silico gating Using the findCellTypes function, you can perform in-silico gating to identify cell type subsets as if the way cell sorting works. To do so, you can add logical operators including “-†and "*" for “no†and “intermediate†expression, respectively in front of the gene name. Here, we use operators to subset T cell of the Thymus dataset into effector T regulatory cells and effector memory T cell. 
effector_t_reg_cells <- c("*Ptprc", "-Il7r", "Ctla4", "-Il7r") effector_memory_t_cells <- c("-Il2ra", "*Ptprc", "Il7r") subset_treg <- findCellTypes(scfind_index, effector_t_reg_cells, "Thymus") subset_tmem <- findCellTypes(scfind_index, effector_memory_t_cells, "Thymus") subset_treg subset_tmem Let’s use the TSNE plot information from the SingleCellExperiment of Thymus to illustrate the gating result map <- data.frame( tm10x_thymus@reducedDims[['TSNE']], cell_type = as.character(colData(tm10x_thymus)$cell_type1), stringsAsFactors = F ) map <- subset(map, cell_type == "T cell") plot_ly(map, x = ~X1 , y = ~X2, type="scatter") map$cell_type[subset_treg$`Thymus.T cell`] <- "Effector T Regulatory Cell" map$cell_type[subset_tmem$`Thymus.T cell`] <- "Effector Memory T Cell" plot_ly(map, x = ~X1 , y = ~X2, type="scatter", color = ~cell_type) 14.1.7 sessionInfo() sessionInfo() References "], ["integrating-single-cell-omics-datasets.html", "15 Integrating single-cell ’omics datasets", " 15 Integrating single-cell ’omics datasets set.seed(1234567) "], diff --git a/public/seurat-chapter.html b/public/seurat-chapter.html index ab470e905d1d3b9379a4e1a3e45e2402545fe5f8..1d075bd3ba28679eb6b1c32e1d9fe12e74e4ad32 100644 --- a/public/seurat-chapter.html +++ b/public/seurat-chapter.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" 
data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" 
data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a 
href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ -399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li 
class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -522,26 +522,26 @@ sequenced on the Illumina NextSeq 500. The raw data can be found <p>We start by reading in the data. All features in Seurat have been configured to work with sparse matrices which results in significant memory and speed savings for Drop-seq/inDrop/10x data.</p> -<div class="sourceCode" id="cb784"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb784-1" data-line-number="1"><span class="kw">library</span>(Seurat)</a> -<a class="sourceLine" id="cb784-2" data-line-number="2"><span class="kw">library</span>(dplyr)</a> -<a class="sourceLine" id="cb784-3" data-line-number="3"><span class="kw">library</span>(cowplot)</a> -<a class="sourceLine" id="cb784-4" data-line-number="4"></a> -<a class="sourceLine" id="cb784-5" data-line-number="5"><span class="co"># Load the PBMC dataset</span></a> -<a class="sourceLine" id="cb784-6" data-line-number="6">pbmc.data <-<span class="st"> </span><span class="kw">Read10X</span>(<span class="dt">data.dir =</span> <span class="st">"data/pbmc3k_filtered_gene_bc_matrices/hg19/"</span>)</a> -<a class="sourceLine" id="cb784-7" data-line-number="7"></a> -<a class="sourceLine" id="cb784-8" data-line-number="8"><span class="co"># Examine the memory savings between regular and sparse matrices</span></a> -<a class="sourceLine" id="cb784-9" data-line-number="9">dense.size <-<span class="st"> </span><span class="kw">object.size</span>(<span 
class="dt">x =</span> <span class="kw">as.matrix</span>(<span class="dt">x =</span> pbmc.data))</a> -<a class="sourceLine" id="cb784-10" data-line-number="10">dense.size</a> -<a class="sourceLine" id="cb784-11" data-line-number="11"></a> -<a class="sourceLine" id="cb784-12" data-line-number="12">sparse.size <-<span class="st"> </span><span class="kw">object.size</span>(<span class="dt">x =</span> pbmc.data)</a> -<a class="sourceLine" id="cb784-13" data-line-number="13">sparse.size</a> -<a class="sourceLine" id="cb784-14" data-line-number="14"></a> -<a class="sourceLine" id="cb784-15" data-line-number="15">dense.size<span class="op">/</span>sparse.size</a> -<a class="sourceLine" id="cb784-16" data-line-number="16"></a> -<a class="sourceLine" id="cb784-17" data-line-number="17"><span class="co"># Initialize the Seurat object with the raw (non-normalized data). Keep all</span></a> -<a class="sourceLine" id="cb784-18" data-line-number="18"><span class="co"># genes expressed in >= 3 cells (~0.1% of the data). 
Keep all cells with at</span></a> -<a class="sourceLine" id="cb784-19" data-line-number="19"><span class="co"># least 200 detected genes</span></a> -<a class="sourceLine" id="cb784-20" data-line-number="20">pbmc <-<span class="st"> </span><span class="kw">CreateSeuratObject</span>(<span class="dt">counts =</span> pbmc.data, <span class="dt">min.cells =</span> <span class="dv">3</span>, <span class="dt">min.features =</span> <span class="dv">200</span>, <span class="dt">project =</span> <span class="st">"10X_PBMC"</span>, <span class="dt">assay =</span> <span class="st">"RNA"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb867"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb867-1" data-line-number="1"><span class="kw">library</span>(Seurat)</a> +<a class="sourceLine" id="cb867-2" data-line-number="2"><span class="kw">library</span>(dplyr)</a> +<a class="sourceLine" id="cb867-3" data-line-number="3"><span class="kw">library</span>(cowplot)</a> +<a class="sourceLine" id="cb867-4" data-line-number="4"></a> +<a class="sourceLine" id="cb867-5" data-line-number="5"><span class="co"># Load the PBMC dataset</span></a> +<a class="sourceLine" id="cb867-6" data-line-number="6">pbmc.data <-<span class="st"> </span><span class="kw">Read10X</span>(<span class="dt">data.dir =</span> <span class="st">"data/pbmc3k_filtered_gene_bc_matrices/hg19/"</span>)</a> +<a class="sourceLine" id="cb867-7" data-line-number="7"></a> +<a class="sourceLine" id="cb867-8" data-line-number="8"><span class="co"># Examine the memory savings between regular and sparse matrices</span></a> +<a class="sourceLine" id="cb867-9" data-line-number="9">dense.size <-<span class="st"> </span><span class="kw">object.size</span>(<span class="dt">x =</span> <span class="kw">as.matrix</span>(<span class="dt">x =</span> pbmc.data))</a> +<a class="sourceLine" id="cb867-10" data-line-number="10">dense.size</a> +<a class="sourceLine" id="cb867-11" data-line-number="11"></a> 
+<a class="sourceLine" id="cb867-12" data-line-number="12">sparse.size <-<span class="st"> </span><span class="kw">object.size</span>(<span class="dt">x =</span> pbmc.data)</a> +<a class="sourceLine" id="cb867-13" data-line-number="13">sparse.size</a> +<a class="sourceLine" id="cb867-14" data-line-number="14"></a> +<a class="sourceLine" id="cb867-15" data-line-number="15">dense.size<span class="op">/</span>sparse.size</a> +<a class="sourceLine" id="cb867-16" data-line-number="16"></a> +<a class="sourceLine" id="cb867-17" data-line-number="17"><span class="co"># Initialize the Seurat object with the raw (non-normalized data). Keep all</span></a> +<a class="sourceLine" id="cb867-18" data-line-number="18"><span class="co"># genes expressed in >= 3 cells (~0.1% of the data). Keep all cells with at</span></a> +<a class="sourceLine" id="cb867-19" data-line-number="19"><span class="co"># least 200 detected genes</span></a> +<a class="sourceLine" id="cb867-20" data-line-number="20">pbmc <-<span class="st"> </span><span class="kw">CreateSeuratObject</span>(<span class="dt">counts =</span> pbmc.data, <span class="dt">min.cells =</span> <span class="dv">3</span>, <span class="dt">min.features =</span> <span class="dv">200</span>, <span class="dt">project =</span> <span class="st">"10X_PBMC"</span>, <span class="dt">assay =</span> <span class="st">"RNA"</span>)</a></code></pre></div> </div> <div id="standard-pre-processing-workflow" class="section level2"> <h2><span class="header-section-number">16.2</span> Standard pre-processing workflow</h2> @@ -561,38 +561,38 @@ of genes detected as potential multiplets. Of course this is not a guaranteed method to exclude cell doublets, but we include this as an example of filtering user-defined outlier cells. 
We also filter cells based on the percentage of mitochondrial genes present.</p> -<div class="sourceCode" id="cb785"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb785-1" data-line-number="1"><span class="co"># The number of genes and UMIs (nGene and nUMI) are automatically calculated</span></a> -<a class="sourceLine" id="cb785-2" data-line-number="2"><span class="co"># for every object by Seurat. For non-UMI data, nUMI represents the sum of</span></a> -<a class="sourceLine" id="cb785-3" data-line-number="3"><span class="co"># the non-normalized values within a cell We calculate the percentage of</span></a> -<a class="sourceLine" id="cb785-4" data-line-number="4"><span class="co"># mitochondrial genes here and store it in percent.mito using AddMetaData.</span></a> -<a class="sourceLine" id="cb785-5" data-line-number="5"><span class="co"># We use object@raw.data since this represents non-transformed and</span></a> -<a class="sourceLine" id="cb785-6" data-line-number="6"><span class="co"># non-log-normalized counts The % of UMI mapping to MT-genes is a common</span></a> -<a class="sourceLine" id="cb785-7" data-line-number="7"><span class="co"># scRNA-seq QC metric.</span></a> -<a class="sourceLine" id="cb785-8" data-line-number="8">mito.genes <-<span class="st"> </span><span class="kw">grep</span>(<span class="dt">pattern =</span> <span class="st">"^MT-"</span>, <span class="dt">x =</span> <span class="kw">rownames</span>(pbmc<span class="op">@</span>assays[[<span class="st">"RNA"</span>]]), <span class="dt">value =</span> <span class="ot">TRUE</span>)</a> -<a class="sourceLine" id="cb785-9" data-line-number="9"></a> -<a class="sourceLine" id="cb785-10" data-line-number="10">percent.mito <-<span class="st"> </span>Matrix<span class="op">::</span><span class="kw">colSums</span>(pbmc<span class="op">@</span>assays[[<span class="st">"RNA"</span>]][mito.genes, ])<span class="op">/</span>Matrix<span class="op">::</span><span 
class="kw">colSums</span>(pbmc<span class="op">@</span>assays[[<span class="st">"RNA"</span>]])</a> -<a class="sourceLine" id="cb785-11" data-line-number="11"></a> -<a class="sourceLine" id="cb785-12" data-line-number="12"><span class="co"># AddMetaData adds columns to object@meta.data, and is a great place to</span></a> -<a class="sourceLine" id="cb785-13" data-line-number="13"><span class="co"># stash QC stats</span></a> -<a class="sourceLine" id="cb785-14" data-line-number="14"></a> -<a class="sourceLine" id="cb785-15" data-line-number="15"><span class="co">#Seurat v2 function, but shows compatibility in Seurat v3</span></a> -<a class="sourceLine" id="cb785-16" data-line-number="16">pbmc <-<span class="st"> </span><span class="kw">AddMetaData</span>(<span class="dt">object =</span> pbmc, <span class="dt">metadata =</span> percent.mito, <span class="dt">col.name =</span> <span class="st">"percent.mito"</span>) </a> -<a class="sourceLine" id="cb785-17" data-line-number="17"><span class="co">#in case the above function does not work simply do:</span></a> -<a class="sourceLine" id="cb785-18" data-line-number="18">pbmc<span class="op">$</span>percent.mito <-<span class="st"> </span>percent.mito</a> -<a class="sourceLine" id="cb785-19" data-line-number="19"></a> -<a class="sourceLine" id="cb785-20" data-line-number="20"><span class="kw">VlnPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span> <span class="kw">c</span>(<span class="st">"nFeature_RNA"</span>, <span class="st">"nCount_RNA"</span>, <span class="st">"percent.mito"</span>), <span class="dt">ncol =</span> <span class="dv">3</span>)</a></code></pre></div> -<div class="sourceCode" id="cb786"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb786-1" data-line-number="1"><span class="co"># GenePlot is typically used to visualize gene-gene relationships, but can</span></a> -<a class="sourceLine" id="cb786-2" data-line-number="2"><span class="co"># be 
used for anything calculated by the object, i.e. columns in</span></a> -<a class="sourceLine" id="cb786-3" data-line-number="3"><span class="co"># object@meta.data, PC scores etc. Since there is a rare subset of cells</span></a> -<a class="sourceLine" id="cb786-4" data-line-number="4"><span class="co"># with an outlier level of high mitochondrial percentage and also low UMI</span></a> -<a class="sourceLine" id="cb786-5" data-line-number="5"><span class="co"># content, we filter these as well</span></a> -<a class="sourceLine" id="cb786-6" data-line-number="6"><span class="kw">par</span>(<span class="dt">mfrow =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">2</span>))</a> -<a class="sourceLine" id="cb786-7" data-line-number="7"><span class="kw">FeatureScatter</span>(<span class="dt">object =</span> pbmc, <span class="dt">feature1 =</span> <span class="st">"nCount_RNA"</span>, <span class="dt">feature2 =</span> <span class="st">"percent.mito"</span>)</a> -<a class="sourceLine" id="cb786-8" data-line-number="8"><span class="kw">FeatureScatter</span>(<span class="dt">object =</span> pbmc, <span class="dt">feature1 =</span> <span class="st">"nCount_RNA"</span>, <span class="dt">feature2 =</span> <span class="st">"nFeature_RNA"</span>)</a></code></pre></div> -<div class="sourceCode" id="cb787"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb787-1" data-line-number="1"><span class="co"># We filter out cells that have unique gene counts (nFeature_RNA) over 2,500 or less than</span></a> -<a class="sourceLine" id="cb787-2" data-line-number="2"><span class="co"># 200 Note that > and < are used to define a'gate'. 
</span></a> -<a class="sourceLine" id="cb787-3" data-line-number="3"><span class="co">#-Inf and Inf should be used if you don't want a lower or upper threshold.</span></a> -<a class="sourceLine" id="cb787-4" data-line-number="4">pbmc <-<span class="st"> </span><span class="kw">subset</span>(<span class="dt">x =</span> pbmc, <span class="dt">subset =</span> nFeature_RNA <span class="op">></span><span class="st"> </span><span class="dv">200</span> <span class="op">&</span><span class="st"> </span>nFeature_RNA <span class="op"><</span><span class="st"> </span><span class="dv">2500</span> <span class="op">&</span><span class="st"> </span>percent.mito <span class="op">></span><span class="st"> </span><span class="op">-</span><span class="ot">Inf</span> <span class="op">&</span><span class="st"> </span>percent.mito <span class="op"><</span><span class="st"> </span><span class="fl">0.05</span> )</a></code></pre></div> +<div class="sourceCode" id="cb868"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb868-1" data-line-number="1"><span class="co"># The number of genes and UMIs (nGene and nUMI) are automatically calculated</span></a> +<a class="sourceLine" id="cb868-2" data-line-number="2"><span class="co"># for every object by Seurat. 
For non-UMI data, nUMI represents the sum of</span></a> +<a class="sourceLine" id="cb868-3" data-line-number="3"><span class="co"># the non-normalized values within a cell We calculate the percentage of</span></a> +<a class="sourceLine" id="cb868-4" data-line-number="4"><span class="co"># mitochondrial genes here and store it in percent.mito using AddMetaData.</span></a> +<a class="sourceLine" id="cb868-5" data-line-number="5"><span class="co"># We use object@raw.data since this represents non-transformed and</span></a> +<a class="sourceLine" id="cb868-6" data-line-number="6"><span class="co"># non-log-normalized counts The % of UMI mapping to MT-genes is a common</span></a> +<a class="sourceLine" id="cb868-7" data-line-number="7"><span class="co"># scRNA-seq QC metric.</span></a> +<a class="sourceLine" id="cb868-8" data-line-number="8">mito.genes <-<span class="st"> </span><span class="kw">grep</span>(<span class="dt">pattern =</span> <span class="st">"^MT-"</span>, <span class="dt">x =</span> <span class="kw">rownames</span>(pbmc<span class="op">@</span>assays[[<span class="st">"RNA"</span>]]), <span class="dt">value =</span> <span class="ot">TRUE</span>)</a> +<a class="sourceLine" id="cb868-9" data-line-number="9"></a> +<a class="sourceLine" id="cb868-10" data-line-number="10">percent.mito <-<span class="st"> </span>Matrix<span class="op">::</span><span class="kw">colSums</span>(pbmc<span class="op">@</span>assays[[<span class="st">"RNA"</span>]][mito.genes, ])<span class="op">/</span>Matrix<span class="op">::</span><span class="kw">colSums</span>(pbmc<span class="op">@</span>assays[[<span class="st">"RNA"</span>]])</a> +<a class="sourceLine" id="cb868-11" data-line-number="11"></a> +<a class="sourceLine" id="cb868-12" data-line-number="12"><span class="co"># AddMetaData adds columns to object@meta.data, and is a great place to</span></a> +<a class="sourceLine" id="cb868-13" data-line-number="13"><span class="co"># stash QC stats</span></a> +<a 
class="sourceLine" id="cb868-14" data-line-number="14"></a> +<a class="sourceLine" id="cb868-15" data-line-number="15"><span class="co">#Seurat v2 function, but shows compatibility in Seurat v3</span></a> +<a class="sourceLine" id="cb868-16" data-line-number="16">pbmc <-<span class="st"> </span><span class="kw">AddMetaData</span>(<span class="dt">object =</span> pbmc, <span class="dt">metadata =</span> percent.mito, <span class="dt">col.name =</span> <span class="st">"percent.mito"</span>) </a> +<a class="sourceLine" id="cb868-17" data-line-number="17"><span class="co">#in case the above function does not work simply do:</span></a> +<a class="sourceLine" id="cb868-18" data-line-number="18">pbmc<span class="op">$</span>percent.mito <-<span class="st"> </span>percent.mito</a> +<a class="sourceLine" id="cb868-19" data-line-number="19"></a> +<a class="sourceLine" id="cb868-20" data-line-number="20"><span class="kw">VlnPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span> <span class="kw">c</span>(<span class="st">"nFeature_RNA"</span>, <span class="st">"nCount_RNA"</span>, <span class="st">"percent.mito"</span>), <span class="dt">ncol =</span> <span class="dv">3</span>)</a></code></pre></div> +<div class="sourceCode" id="cb869"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb869-1" data-line-number="1"><span class="co"># GenePlot is typically used to visualize gene-gene relationships, but can</span></a> +<a class="sourceLine" id="cb869-2" data-line-number="2"><span class="co"># be used for anything calculated by the object, i.e. columns in</span></a> +<a class="sourceLine" id="cb869-3" data-line-number="3"><span class="co"># object@meta.data, PC scores etc. 
Since there is a rare subset of cells</span></a> +<a class="sourceLine" id="cb869-4" data-line-number="4"><span class="co"># with an outlier level of high mitochondrial percentage and also low UMI</span></a> +<a class="sourceLine" id="cb869-5" data-line-number="5"><span class="co"># content, we filter these as well</span></a> +<a class="sourceLine" id="cb869-6" data-line-number="6"><span class="kw">par</span>(<span class="dt">mfrow =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">2</span>))</a> +<a class="sourceLine" id="cb869-7" data-line-number="7"><span class="kw">FeatureScatter</span>(<span class="dt">object =</span> pbmc, <span class="dt">feature1 =</span> <span class="st">"nCount_RNA"</span>, <span class="dt">feature2 =</span> <span class="st">"percent.mito"</span>)</a> +<a class="sourceLine" id="cb869-8" data-line-number="8"><span class="kw">FeatureScatter</span>(<span class="dt">object =</span> pbmc, <span class="dt">feature1 =</span> <span class="st">"nCount_RNA"</span>, <span class="dt">feature2 =</span> <span class="st">"nFeature_RNA"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb870"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb870-1" data-line-number="1"><span class="co"># We filter out cells that have unique gene counts (nFeature_RNA) over 2,500 or less than</span></a> +<a class="sourceLine" id="cb870-2" data-line-number="2"><span class="co"># 200 Note that > and < are used to define a'gate'. 
</span></a> +<a class="sourceLine" id="cb870-3" data-line-number="3"><span class="co">#-Inf and Inf should be used if you don't want a lower or upper threshold.</span></a> +<a class="sourceLine" id="cb870-4" data-line-number="4">pbmc <-<span class="st"> </span><span class="kw">subset</span>(<span class="dt">x =</span> pbmc, <span class="dt">subset =</span> nFeature_RNA <span class="op">></span><span class="st"> </span><span class="dv">200</span> <span class="op">&</span><span class="st"> </span>nFeature_RNA <span class="op"><</span><span class="st"> </span><span class="dv">2500</span> <span class="op">&</span><span class="st"> </span>percent.mito <span class="op">></span><span class="st"> </span><span class="op">-</span><span class="ot">Inf</span> <span class="op">&</span><span class="st"> </span>percent.mito <span class="op"><</span><span class="st"> </span><span class="fl">0.05</span> )</a></code></pre></div> </div> <div id="normalizing-the-data" class="section level2"> <h2><span class="header-section-number">16.4</span> Normalizing the data</h2> @@ -601,7 +601,7 @@ the data. 
By default, we employ a global-scaling normalization method “LogNormalize†that normalizes the gene expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result.</p> -<div class="sourceCode" id="cb788"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb788-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">NormalizeData</span>(<span class="dt">object =</span> pbmc, <span class="dt">normalization.method =</span> <span class="st">"LogNormalize"</span>, <span class="dt">scale.factor =</span> <span class="dv">10000</span>)</a></code></pre></div> +<div class="sourceCode" id="cb871"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb871-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">NormalizeData</span>(<span class="dt">object =</span> pbmc, <span class="dt">normalization.method =</span> <span class="st">"LogNormalize"</span>, <span class="dt">scale.factor =</span> <span class="dv">10000</span>)</a></code></pre></div> </div> <div id="detection-of-variable-genes-across-the-single-cells" class="section level2"> <h2><span class="header-section-number">16.5</span> Detection of variable genes across the single cells</h2> @@ -616,9 +616,9 @@ dispersion plot, but the exact parameter settings may vary based on the data type, heterogeneity in the sample, and normalization strategy. 
The parameters here identify ~2,000 variable genes, and represent typical parameter settings for UMI data that is normalized to a total of 1e4 molecules.</p> -<div class="sourceCode" id="cb789"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb789-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">FindVariableFeatures</span>(<span class="dt">object =</span> pbmc, <span class="dt">mean.function =</span> ExpMean, <span class="dt">dispersion.function =</span> LogVMR, <span class="dt">x.low.cutoff =</span> <span class="fl">0.0125</span>, <span class="dt">x.high.cutoff =</span> <span class="dv">3</span>, <span class="dt">y.cutoff =</span> <span class="fl">0.5</span>, <span class="dt">nfeatures =</span> <span class="dv">2000</span>)</a></code></pre></div> +<div class="sourceCode" id="cb872"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb872-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">FindVariableFeatures</span>(<span class="dt">object =</span> pbmc, <span class="dt">mean.function =</span> ExpMean, <span class="dt">dispersion.function =</span> LogVMR, <span class="dt">x.low.cutoff =</span> <span class="fl">0.0125</span>, <span class="dt">x.high.cutoff =</span> <span class="dv">3</span>, <span class="dt">y.cutoff =</span> <span class="fl">0.5</span>, <span class="dt">nfeatures =</span> <span class="dv">2000</span>)</a></code></pre></div> <p>To view the output of the FindVariableFeatures output we use this function. 
The genes appear not to be stored in the object, but can be accessed this way.</p> -<div class="sourceCode" id="cb790"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb790-1" data-line-number="1"><span class="kw">head</span>(<span class="dt">x =</span> <span class="kw">HVFInfo</span>(<span class="dt">object =</span> pbmc))</a></code></pre></div> +<div class="sourceCode" id="cb873"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb873-1" data-line-number="1"><span class="kw">head</span>(<span class="dt">x =</span> <span class="kw">HVFInfo</span>(<span class="dt">object =</span> pbmc))</a></code></pre></div> </div> <div id="scaling-the-data-and-removing-unwanted-sources-of-variation" class="section level2"> <h2><span class="header-section-number">16.6</span> Scaling the data and removing unwanted sources of variation</h2> @@ -642,7 +642,7 @@ mitochondrial gene content.</p> <p>Seurat v2.0 implements this regression as part of the data scaling process. 
Therefore, the <code>RegressOut</code> function has been deprecated, and replaced with the vars.to.regress argument in <code>ScaleData</code>.</p> -<div class="sourceCode" id="cb791"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb791-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">ScaleData</span>(<span class="dt">object =</span> pbmc, <span class="dt">vars.to.regress =</span> <span class="kw">c</span>(<span class="st">"nCounts_RNA"</span>, <span class="st">"percent.mito"</span>))</a></code></pre></div> +<div class="sourceCode" id="cb874"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb874-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">ScaleData</span>(<span class="dt">object =</span> pbmc, <span class="dt">vars.to.regress =</span> <span class="kw">c</span>(<span class="st">"nCounts_RNA"</span>, <span class="st">"percent.mito"</span>))</a></code></pre></div> </div> <div id="perform-linear-dimensional-reduction" class="section level2"> <h2><span class="header-section-number">16.7</span> Perform linear dimensional reduction</h2> @@ -660,7 +660,7 @@ output PCA graph has always the same dimensions, indicating that the provided genes in the features argument are not exactly the ones used to compute PCA. 
Wether the function gets the HVG directly or does not take them into account, I don’t know.</p> -<div class="sourceCode" id="cb792"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb792-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">RunPCA</span>(<span class="dt">object =</span> pbmc, <span class="dt">npcs =</span> <span class="dv">30</span>, <span class="dt">verbose =</span> <span class="ot">FALSE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb875"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb875-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">RunPCA</span>(<span class="dt">object =</span> pbmc, <span class="dt">npcs =</span> <span class="dv">30</span>, <span class="dt">verbose =</span> <span class="ot">FALSE</span>)</a></code></pre></div> <p>–> refered to Seurat v2: Seurat provides several useful ways of visualizing both cells and genes that define the PCA, including <code>PrintPCA</code>, <code>VizPCA</code>, <code>PCAPlot</code>, and <code>PCHeatmap</code></p> @@ -673,19 +673,19 @@ Seurat v3 provides functions for visualizing: - Variable Feature Plot - Violin and Ridge plots - Heatmaps</p> -<div class="sourceCode" id="cb793"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb793-1" data-line-number="1"><span class="co"># Examine and visualize PCA results a few different ways</span></a> -<a class="sourceLine" id="cb793-2" data-line-number="2"><span class="kw">DimPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"pca"</span>)</a></code></pre></div> -<div class="sourceCode" id="cb794"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb794-1" data-line-number="1"><span class="co"># Dimensional reduction plot, with cells colored by a quantitative feature</span></a> -<a class="sourceLine" id="cb794-2" 
data-line-number="2"><span class="kw">FeaturePlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span> <span class="st">"MS4A1"</span>)</a></code></pre></div> -<div class="sourceCode" id="cb795"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb795-1" data-line-number="1"><span class="co"># Scatter plot across single cells, replaces GenePlot</span></a> -<a class="sourceLine" id="cb795-2" data-line-number="2"><span class="kw">FeatureScatter</span>(<span class="dt">object =</span> pbmc, <span class="dt">feature1 =</span> <span class="st">"MS4A1"</span>, <span class="dt">feature2 =</span> <span class="st">"PC_1"</span>)</a> -<a class="sourceLine" id="cb795-3" data-line-number="3"><span class="kw">FeatureScatter</span>(<span class="dt">object =</span> pbmc, <span class="dt">feature1 =</span> <span class="st">"MS4A1"</span>, <span class="dt">feature2 =</span> <span class="st">"CD3D"</span>)</a></code></pre></div> -<div class="sourceCode" id="cb796"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb796-1" data-line-number="1"><span class="co"># Scatter plot across individual features, repleaces CellPlot</span></a> -<a class="sourceLine" id="cb796-2" data-line-number="2"><span class="kw">CellScatter</span>(<span class="dt">object =</span> pbmc, <span class="dt">cell1 =</span> <span class="st">"AGTCTACTAGGGTG"</span>, <span class="dt">cell2 =</span> <span class="st">"CACAGATGGTTTCT"</span>)</a></code></pre></div> -<div class="sourceCode" id="cb797"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb797-1" data-line-number="1"><span class="kw">VariableFeaturePlot</span>(<span class="dt">object =</span> pbmc)</a></code></pre></div> -<div class="sourceCode" id="cb798"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb798-1" data-line-number="1"><span class="co"># Violin and Ridge plots</span></a> -<a class="sourceLine" 
id="cb798-2" data-line-number="2"><span class="kw">VlnPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span> <span class="kw">c</span>(<span class="st">"LYZ"</span>, <span class="st">"CCL5"</span>, <span class="st">"IL32"</span>))</a> -<a class="sourceLine" id="cb798-3" data-line-number="3"><span class="kw">RidgePlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">feature =</span> <span class="kw">c</span>(<span class="st">"LYZ"</span>, <span class="st">"CCL5"</span>, <span class="st">"IL32"</span>))</a></code></pre></div> +<div class="sourceCode" id="cb876"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb876-1" data-line-number="1"><span class="co"># Examine and visualize PCA results a few different ways</span></a> +<a class="sourceLine" id="cb876-2" data-line-number="2"><span class="kw">DimPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"pca"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb877"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb877-1" data-line-number="1"><span class="co"># Dimensional reduction plot, with cells colored by a quantitative feature</span></a> +<a class="sourceLine" id="cb877-2" data-line-number="2"><span class="kw">FeaturePlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span> <span class="st">"MS4A1"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb878"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb878-1" data-line-number="1"><span class="co"># Scatter plot across single cells, replaces GenePlot</span></a> +<a class="sourceLine" id="cb878-2" data-line-number="2"><span class="kw">FeatureScatter</span>(<span class="dt">object =</span> pbmc, <span class="dt">feature1 =</span> <span class="st">"MS4A1"</span>, <span class="dt">feature2 =</span> <span class="st">"PC_1"</span>)</a> +<a 
class="sourceLine" id="cb878-3" data-line-number="3"><span class="kw">FeatureScatter</span>(<span class="dt">object =</span> pbmc, <span class="dt">feature1 =</span> <span class="st">"MS4A1"</span>, <span class="dt">feature2 =</span> <span class="st">"CD3D"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb879"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb879-1" data-line-number="1"><span class="co"># Scatter plot across individual features, repleaces CellPlot</span></a> +<a class="sourceLine" id="cb879-2" data-line-number="2"><span class="kw">CellScatter</span>(<span class="dt">object =</span> pbmc, <span class="dt">cell1 =</span> <span class="st">"AGTCTACTAGGGTG"</span>, <span class="dt">cell2 =</span> <span class="st">"CACAGATGGTTTCT"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb880"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb880-1" data-line-number="1"><span class="kw">VariableFeaturePlot</span>(<span class="dt">object =</span> pbmc)</a></code></pre></div> +<div class="sourceCode" id="cb881"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb881-1" data-line-number="1"><span class="co"># Violin and Ridge plots</span></a> +<a class="sourceLine" id="cb881-2" data-line-number="2"><span class="kw">VlnPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span> <span class="kw">c</span>(<span class="st">"LYZ"</span>, <span class="st">"CCL5"</span>, <span class="st">"IL32"</span>))</a> +<a class="sourceLine" id="cb881-3" data-line-number="3"><span class="kw">RidgePlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">feature =</span> <span class="kw">c</span>(<span class="st">"LYZ"</span>, <span class="st">"CCL5"</span>, <span class="st">"IL32"</span>))</a></code></pre></div> <p>In particular <code>DimHeatmap</code> allows for easy exploration of the primary sources of heterogeneity in a dataset, 
and can be useful when trying to decide which PCs to include for further downstream analyses. Both cells and genes are ordered @@ -693,8 +693,8 @@ according to their PCA scores. Setting cells.use to a number plots the ‘extrem cells on both ends of the spectrum, which dramatically speeds plotting for large datasets. Though clearly a supervised analysis, we find this to be a valuable tool for exploring correlated gene sets.</p> -<div class="sourceCode" id="cb799"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb799-1" data-line-number="1"><span class="co"># Heatmaps</span></a> -<a class="sourceLine" id="cb799-2" data-line-number="2"><span class="kw">DimHeatmap</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"pca"</span>, <span class="dt">cells =</span> <span class="dv">200</span>, <span class="dt">balanced =</span> <span class="ot">TRUE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb882"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb882-1" data-line-number="1"><span class="co"># Heatmaps</span></a> +<a class="sourceLine" id="cb882-2" data-line-number="2"><span class="kw">DimHeatmap</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"pca"</span>, <span class="dt">cells =</span> <span class="dv">200</span>, <span class="dt">balanced =</span> <span class="ot">TRUE</span>)</a></code></pre></div> <p>ProjectPCA function is no loger available in Seurat 3.0.</p> </div> <div id="determine-statistically-significant-principal-components" class="section level2"> @@ -709,22 +709,22 @@ implemented a resampling test inspired by the jackStraw procedure. We randomly permute a subset of the data (1% by default) and rerun PCA, constructing a ‘null distribution’ of gene scores, and repeat this procedure. 
We identify ‘significant’ PCs as those who have a strong enrichment of low p-value genes.</p> -<div class="sourceCode" id="cb800"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb800-1" data-line-number="1"><span class="co"># </span><span class="al">NOTE</span><span class="co">: This process can take a long time for big datasets, comment out for</span></a> -<a class="sourceLine" id="cb800-2" data-line-number="2"><span class="co"># expediency. More approximate techniques such as those implemented in</span></a> -<a class="sourceLine" id="cb800-3" data-line-number="3"><span class="co"># PCElbowPlot() can be used to reduce computation time</span></a> -<a class="sourceLine" id="cb800-4" data-line-number="4">pbmc <-<span class="st"> </span><span class="kw">JackStraw</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"pca"</span>, <span class="dt">dims =</span> <span class="dv">20</span>, <span class="dt">num.replicate =</span> <span class="dv">100</span>, <span class="dt">prop.freq =</span> <span class="fl">0.1</span>, <span class="dt">verbose =</span> <span class="ot">FALSE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb883"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb883-1" data-line-number="1"><span class="co"># </span><span class="al">NOTE</span><span class="co">: This process can take a long time for big datasets, comment out for</span></a> +<a class="sourceLine" id="cb883-2" data-line-number="2"><span class="co"># expediency. 
More approximate techniques such as those implemented in</span></a> +<a class="sourceLine" id="cb883-3" data-line-number="3"><span class="co"># PCElbowPlot() can be used to reduce computation time</span></a> +<a class="sourceLine" id="cb883-4" data-line-number="4">pbmc <-<span class="st"> </span><span class="kw">JackStraw</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"pca"</span>, <span class="dt">dims =</span> <span class="dv">20</span>, <span class="dt">num.replicate =</span> <span class="dv">100</span>, <span class="dt">prop.freq =</span> <span class="fl">0.1</span>, <span class="dt">verbose =</span> <span class="ot">FALSE</span>)</a></code></pre></div> <p>The <code>JackStrawPlot</code> function provides a visualization tool for comparing the distribution of p-values for each PC with a uniform distribution (dashed line). ‘Significant’ PCs will show a strong enrichment of genes with low p-values (solid curve above the dashed line). In this case it appears that PCs 1-10 are significant.</p> -<div class="sourceCode" id="cb801"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb801-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">ScoreJackStraw</span>(<span class="dt">object =</span> pbmc, <span class="dt">dims =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">20</span>, <span class="dt">reduction =</span> <span class="st">"pca"</span>)</a></code></pre></div> -<div class="sourceCode" id="cb802"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb802-1" data-line-number="1"><span class="kw">JackStrawPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">dims =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">20</span>, <span class="dt">reduction =</span> <span class="st">"pca"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb884"><pre class="sourceCode 
r"><code class="sourceCode r"><a class="sourceLine" id="cb884-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">ScoreJackStraw</span>(<span class="dt">object =</span> pbmc, <span class="dt">dims =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">20</span>, <span class="dt">reduction =</span> <span class="st">"pca"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb885"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb885-1" data-line-number="1"><span class="kw">JackStrawPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">dims =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">20</span>, <span class="dt">reduction =</span> <span class="st">"pca"</span>)</a></code></pre></div> <p>A more ad hoc method for determining which PCs to use is to look at a plot of the standard deviations of the principle components and draw your cutoff where there is a clear elbow in the graph. This can be done with <code>ElbowPlot</code>. In this example, it looks like the elbow would fall around PC 5.</p> -<div class="sourceCode" id="cb803"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb803-1" data-line-number="1"><span class="kw">ElbowPlot</span>(<span class="dt">object =</span> pbmc)</a></code></pre></div> +<div class="sourceCode" id="cb886"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb886-1" data-line-number="1"><span class="kw">ElbowPlot</span>(<span class="dt">object =</span> pbmc)</a></code></pre></div> <p>PC selection – identifying the true dimensionality of a dataset – is an important step for Seurat, but can be challenging/uncertain for the user. We therefore suggest these three approaches to consider. The first is more @@ -772,8 +772,8 @@ datasets. 
Latest clustering results will be stored in object metadata under <code>seurat_clusters</code>.</p> <p>First calculate k-nearest neighbors and construct the SNN graph (<code>FindNeighbors</code>), then run <code>FindClusters</code>.</p> -<div class="sourceCode" id="cb804"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb804-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">FindNeighbors</span>(pbmc, <span class="dt">reduction =</span> <span class="st">"pca"</span>, <span class="dt">dims =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">20</span>)</a> -<a class="sourceLine" id="cb804-2" data-line-number="2">pbmc <-<span class="st"> </span><span class="kw">FindClusters</span>(pbmc, <span class="dt">resolution =</span> <span class="fl">0.5</span>, <span class="dt">algorithm =</span> <span class="dv">1</span>)</a></code></pre></div> +<div class="sourceCode" id="cb887"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb887-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">FindNeighbors</span>(pbmc, <span class="dt">reduction =</span> <span class="st">"pca"</span>, <span class="dt">dims =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">20</span>)</a> +<a class="sourceLine" id="cb887-2" data-line-number="2">pbmc <-<span class="st"> </span><span class="kw">FindClusters</span>(pbmc, <span class="dt">resolution =</span> <span class="fl">0.5</span>, <span class="dt">algorithm =</span> <span class="dv">1</span>)</a></code></pre></div> </div> <div id="run-non-linear-dimensional-reduction-tsne" class="section level2"> <h2><span class="header-section-number">16.10</span> Run Non-linear dimensional reduction (tSNE)</h2> @@ -785,20 +785,20 @@ neighborhoods in high-dimensional space together in low-dimensional space. 
As input to the tSNE, we suggest using the same PCs as input to the clustering analysis, although computing the tSNE based on scaled gene expression is also supported using the genes.use argument.</p> -<div class="sourceCode" id="cb805"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb805-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">RunTSNE</span>(<span class="dt">object =</span> pbmc, <span class="dt">dims.use =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">10</span>, <span class="dt">do.fast =</span> <span class="ot">TRUE</span>)</a> -<a class="sourceLine" id="cb805-2" data-line-number="2"><span class="co"># note that you can set do.label=T to help label individual clusters</span></a> -<a class="sourceLine" id="cb805-3" data-line-number="3"><span class="kw">DimPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"tsne"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb888"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb888-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">RunTSNE</span>(<span class="dt">object =</span> pbmc, <span class="dt">dims.use =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">10</span>, <span class="dt">do.fast =</span> <span class="ot">TRUE</span>)</a> +<a class="sourceLine" id="cb888-2" data-line-number="2"><span class="co"># note that you can set do.label=T to help label individual clusters</span></a> +<a class="sourceLine" id="cb888-3" data-line-number="3"><span class="kw">DimPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"tsne"</span>)</a></code></pre></div> </div> <div id="run-umap" class="section level2"> <h2><span class="header-section-number">16.11</span> Run UMAP</h2> <p>To visualize the two conditions side-by-side, we can use the split.by 
argument to show each condition colored by cluster.</p> -<div class="sourceCode" id="cb806"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb806-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">RunUMAP</span>(pbmc, <span class="dt">reduction =</span> <span class="st">"pca"</span>, <span class="dt">dims =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">20</span>)</a> -<a class="sourceLine" id="cb806-2" data-line-number="2"><span class="kw">DimPlot</span>(pbmc, <span class="dt">reduction =</span> <span class="st">"umap"</span>, <span class="dt">split.by =</span> <span class="st">"seurat_clusters"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb889"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb889-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">RunUMAP</span>(pbmc, <span class="dt">reduction =</span> <span class="st">"pca"</span>, <span class="dt">dims =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">20</span>)</a> +<a class="sourceLine" id="cb889-2" data-line-number="2"><span class="kw">DimPlot</span>(pbmc, <span class="dt">reduction =</span> <span class="st">"umap"</span>, <span class="dt">split.by =</span> <span class="st">"seurat_clusters"</span>)</a></code></pre></div> <p>You can save the object at this point so that it can easily be loaded back in without having to rerun the computationally intensive steps performed above, or easily shared with collaborators.</p> -<div class="sourceCode" id="cb807"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb807-1" data-line-number="1"><span class="kw">saveRDS</span>(pbmc, <span class="dt">file =</span> <span class="st">"data/pbmc_tutorial.rds"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb890"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb890-1" 
data-line-number="1"><span class="kw">saveRDS</span>(pbmc, <span class="dt">file =</span> <span class="st">"data/pbmc_tutorial.rds"</span>)</a></code></pre></div> </div> <div id="finding-differentially-expressed-genes-cluster-biomarkers" class="section level2"> <h2><span class="header-section-number">16.12</span> Finding differentially expressed genes (cluster biomarkers)</h2> @@ -817,22 +817,22 @@ discriminatory. As another option to speed up these computations, have no more cells than whatever this is set to. While there is generally going to be a loss in power, the speed increases can be significiant and the most highly differentially expressed genes will likely still rise to the top.</p> -<div class="sourceCode" id="cb808"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb808-1" data-line-number="1"><span class="co"># find all markers of cluster 1</span></a> -<a class="sourceLine" id="cb808-2" data-line-number="2">cluster1.markers <-<span class="st"> </span><span class="kw">FindMarkers</span>(<span class="dt">object =</span> pbmc, <span class="dt">ident.1 =</span> <span class="dv">1</span>, <span class="dt">min.pct =</span> <span class="fl">0.25</span>)</a> -<a class="sourceLine" id="cb808-3" data-line-number="3"><span class="kw">print</span>(<span class="dt">x =</span> <span class="kw">head</span>(<span class="dt">x =</span> cluster1.markers, <span class="dt">n =</span> <span class="dv">5</span>))</a></code></pre></div> -<div class="sourceCode" id="cb809"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb809-1" data-line-number="1"><span class="co"># find all markers distinguishing cluster 5 from clusters 0 and 3</span></a> -<a class="sourceLine" id="cb809-2" data-line-number="2">cluster5.markers <-<span class="st"> </span><span class="kw">FindMarkers</span>(<span class="dt">object =</span> pbmc, <span class="dt">ident.1 =</span> <span class="dv">2</span>, <span class="dt">ident.2 =</span> <span 
class="kw">c</span>(<span class="dv">0</span>, <span class="dv">3</span>), <span class="dt">min.pct =</span> <span class="fl">0.25</span>)</a> -<a class="sourceLine" id="cb809-3" data-line-number="3"><span class="kw">print</span>(<span class="dt">x =</span> <span class="kw">head</span>(<span class="dt">x =</span> cluster5.markers, <span class="dt">n =</span> <span class="dv">5</span>))</a></code></pre></div> -<div class="sourceCode" id="cb810"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb810-1" data-line-number="1"><span class="co"># find markers for every cluster compared to all remaining cells, report</span></a> -<a class="sourceLine" id="cb810-2" data-line-number="2"><span class="co"># only the positive ones</span></a> -<a class="sourceLine" id="cb810-3" data-line-number="3">pbmc.markers <-<span class="st"> </span><span class="kw">FindAllMarkers</span>(<span class="dt">object =</span> pbmc, <span class="dt">only.pos =</span> <span class="ot">TRUE</span>, <span class="dt">min.pct =</span> <span class="fl">0.25</span>, <span class="dt">thresh.use =</span> <span class="fl">0.25</span>)</a> -<a class="sourceLine" id="cb810-4" data-line-number="4">pbmc.markers <span class="op">%>%</span><span class="st"> </span><span class="kw">group_by</span>(cluster) <span class="op">%>%</span><span class="st"> </span><span class="kw">top_n</span>(<span class="dv">2</span>, avg_logFC)</a></code></pre></div> +<div class="sourceCode" id="cb891"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb891-1" data-line-number="1"><span class="co"># find all markers of cluster 1</span></a> +<a class="sourceLine" id="cb891-2" data-line-number="2">cluster1.markers <-<span class="st"> </span><span class="kw">FindMarkers</span>(<span class="dt">object =</span> pbmc, <span class="dt">ident.1 =</span> <span class="dv">1</span>, <span class="dt">min.pct =</span> <span class="fl">0.25</span>)</a> +<a class="sourceLine" id="cb891-3" 
data-line-number="3"><span class="kw">print</span>(<span class="dt">x =</span> <span class="kw">head</span>(<span class="dt">x =</span> cluster1.markers, <span class="dt">n =</span> <span class="dv">5</span>))</a></code></pre></div> +<div class="sourceCode" id="cb892"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb892-1" data-line-number="1"><span class="co"># find all markers distinguishing cluster 5 from clusters 0 and 3</span></a> +<a class="sourceLine" id="cb892-2" data-line-number="2">cluster5.markers <-<span class="st"> </span><span class="kw">FindMarkers</span>(<span class="dt">object =</span> pbmc, <span class="dt">ident.1 =</span> <span class="dv">2</span>, <span class="dt">ident.2 =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">3</span>), <span class="dt">min.pct =</span> <span class="fl">0.25</span>)</a> +<a class="sourceLine" id="cb892-3" data-line-number="3"><span class="kw">print</span>(<span class="dt">x =</span> <span class="kw">head</span>(<span class="dt">x =</span> cluster5.markers, <span class="dt">n =</span> <span class="dv">5</span>))</a></code></pre></div> +<div class="sourceCode" id="cb893"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb893-1" data-line-number="1"><span class="co"># find markers for every cluster compared to all remaining cells, report</span></a> +<a class="sourceLine" id="cb893-2" data-line-number="2"><span class="co"># only the positive ones</span></a> +<a class="sourceLine" id="cb893-3" data-line-number="3">pbmc.markers <-<span class="st"> </span><span class="kw">FindAllMarkers</span>(<span class="dt">object =</span> pbmc, <span class="dt">only.pos =</span> <span class="ot">TRUE</span>, <span class="dt">min.pct =</span> <span class="fl">0.25</span>, <span class="dt">thresh.use =</span> <span class="fl">0.25</span>)</a> +<a class="sourceLine" id="cb893-4" data-line-number="4">pbmc.markers <span class="op">%>%</span><span 
class="st"> </span><span class="kw">group_by</span>(cluster) <span class="op">%>%</span><span class="st"> </span><span class="kw">top_n</span>(<span class="dv">2</span>, avg_logFC)</a></code></pre></div> <p>Seurat has several tests for differential expression which can be set with the test.use parameter (see our <a href="http://satijalab.org/seurat/de_vignette.html">DE vignette</a> for details). For example, the ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect).</p> -<div class="sourceCode" id="cb811"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb811-1" data-line-number="1">cluster1.markers <-<span class="st"> </span><span class="kw">FindMarkers</span>(<span class="dt">object =</span> pbmc, <span class="dt">ident.1 =</span> <span class="dv">0</span>, <span class="dt">thresh.use =</span> <span class="fl">0.25</span>, <span class="dt">test.use =</span> <span class="st">"roc"</span>, <span class="dt">only.pos =</span> <span class="ot">TRUE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb894"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb894-1" data-line-number="1">cluster1.markers <-<span class="st"> </span><span class="kw">FindMarkers</span>(<span class="dt">object =</span> pbmc, <span class="dt">ident.1 =</span> <span class="dv">0</span>, <span class="dt">thresh.use =</span> <span class="fl">0.25</span>, <span class="dt">test.use =</span> <span class="st">"roc"</span>, <span class="dt">only.pos =</span> <span class="ot">TRUE</span>)</a></code></pre></div> <p>We include several tools for visualizing marker expression. 
• <code>VlnPlot</code> (shows expression probability distributions across clusters), • and <code>FeaturePlot</code> (visualizes gene expression on a tSNE or PCA plot) are our most commonly used visualizations.</p> @@ -840,24 +840,24 @@ marker (ranging from 0 - random, to 1 - perfect).</p> • <code>RidgePlot</code>, • <code>CellPlot</code>, and • <code>DotPlot</code> as additional methods to view your dataset.</p> -<div class="sourceCode" id="cb812"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb812-1" data-line-number="1"><span class="kw">VlnPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span><span class="kw">c</span>(<span class="st">"NKG7"</span>, <span class="st">"PF4"</span>))</a></code></pre></div> -<div class="sourceCode" id="cb813"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb813-1" data-line-number="1"><span class="kw">FeaturePlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span> <span class="kw">c</span>(<span class="st">"MS4A1"</span>, <span class="st">"GNLY"</span>, <span class="st">"CD3E"</span>, <span class="st">"CD14"</span>, <span class="st">"FCER1A"</span>, <span class="st">"FCGR3A"</span>, <span class="st">"LYZ"</span>, <span class="st">"PPBP"</span>, <span class="st">"CD8A"</span>), <span class="dt">cols =</span> <span class="kw">c</span>(<span class="st">"grey"</span>, <span class="st">"blue"</span>), <span class="dt">reduction =</span> <span class="st">"tsne"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb895"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb895-1" data-line-number="1"><span class="kw">VlnPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span><span class="kw">c</span>(<span class="st">"NKG7"</span>, <span class="st">"PF4"</span>))</a></code></pre></div> +<div class="sourceCode" id="cb896"><pre class="sourceCode r"><code 
class="sourceCode r"><a class="sourceLine" id="cb896-1" data-line-number="1"><span class="kw">FeaturePlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span> <span class="kw">c</span>(<span class="st">"MS4A1"</span>, <span class="st">"GNLY"</span>, <span class="st">"CD3E"</span>, <span class="st">"CD14"</span>, <span class="st">"FCER1A"</span>, <span class="st">"FCGR3A"</span>, <span class="st">"LYZ"</span>, <span class="st">"PPBP"</span>, <span class="st">"CD8A"</span>), <span class="dt">cols =</span> <span class="kw">c</span>(<span class="st">"grey"</span>, <span class="st">"blue"</span>), <span class="dt">reduction =</span> <span class="st">"tsne"</span>)</a></code></pre></div> <p><code>DoHeatmap</code> generates an expression heatmap for given cells and genes. In this case, we are plotting the top 20 markers (or all markers if less than 20) for each cluster.</p> -<div class="sourceCode" id="cb814"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb814-1" data-line-number="1">top10 <-<span class="st"> </span>pbmc.markers <span class="op">%>%</span><span class="st"> </span><span class="kw">group_by</span>(cluster) <span class="op">%>%</span><span class="st"> </span><span class="kw">top_n</span>(<span class="dv">10</span>, avg_logFC)</a> -<a class="sourceLine" id="cb814-2" data-line-number="2"><span class="co"># setting slim.col.label to TRUE will print just the cluster IDS instead of</span></a> -<a class="sourceLine" id="cb814-3" data-line-number="3"><span class="co"># every cell name</span></a> -<a class="sourceLine" id="cb814-4" data-line-number="4"><span class="kw">DoHeatmap</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span> top10<span class="op">$</span>gene, <span class="dt">label =</span> <span class="ot">TRUE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb897"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb897-1" 
data-line-number="1">top10 <-<span class="st"> </span>pbmc.markers <span class="op">%>%</span><span class="st"> </span><span class="kw">group_by</span>(cluster) <span class="op">%>%</span><span class="st"> </span><span class="kw">top_n</span>(<span class="dv">10</span>, avg_logFC)</a> +<a class="sourceLine" id="cb897-2" data-line-number="2"><span class="co"># setting slim.col.label to TRUE will print just the cluster IDS instead of</span></a> +<a class="sourceLine" id="cb897-3" data-line-number="3"><span class="co"># every cell name</span></a> +<a class="sourceLine" id="cb897-4" data-line-number="4"><span class="kw">DoHeatmap</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span> top10<span class="op">$</span>gene, <span class="dt">label =</span> <span class="ot">TRUE</span>)</a></code></pre></div> </div> <div id="assigning-cell-type-identity-to-clusters" class="section level2"> <h2><span class="header-section-number">16.13</span> Assigning cell type identity to clusters</h2> <p>Fortunately in the case of this dataset, we can use canonical markers to easily match the unbiased clustering to known cell types.</p> -<div class="sourceCode" id="cb815"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb815-1" data-line-number="1">current.cluster.ids <-<span class="st"> </span><span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">1</span>, <span class="dv">2</span>, <span class="dv">3</span>, <span class="dv">4</span>, <span class="dv">5</span>, <span class="dv">6</span>, <span class="dv">7</span>)</a> -<a class="sourceLine" id="cb815-2" data-line-number="2">new.cluster.ids <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"CD4 T cells"</span>, <span class="st">"CD14+ Monocytes"</span>, <span class="st">"B cells"</span>, <span class="st">"CD8 T cells"</span>, <span class="st">"FCGR3A+ Monocytes"</span>, <span class="st">"NK cells"</span>, <span class="st">"Dendritic cells"</span>, 
<span class="st">"Megakaryocytes"</span>)</a> -<a class="sourceLine" id="cb815-3" data-line-number="3">pbmc<span class="op">@</span>active.ident <-<span class="st"> </span>plyr<span class="op">::</span><span class="kw">mapvalues</span>(<span class="dt">x =</span> pbmc<span class="op">@</span>active.ident, <span class="dt">from =</span> current.cluster.ids, <span class="dt">to =</span> new.cluster.ids)</a> -<a class="sourceLine" id="cb815-4" data-line-number="4"><span class="kw">DimPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"tsne"</span>, <span class="dt">do.label =</span> <span class="ot">TRUE</span>, <span class="dt">pt.size =</span> <span class="fl">0.5</span>)</a></code></pre></div> +<div class="sourceCode" id="cb898"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb898-1" data-line-number="1">current.cluster.ids <-<span class="st"> </span><span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">1</span>, <span class="dv">2</span>, <span class="dv">3</span>, <span class="dv">4</span>, <span class="dv">5</span>, <span class="dv">6</span>, <span class="dv">7</span>)</a> +<a class="sourceLine" id="cb898-2" data-line-number="2">new.cluster.ids <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"CD4 T cells"</span>, <span class="st">"CD14+ Monocytes"</span>, <span class="st">"B cells"</span>, <span class="st">"CD8 T cells"</span>, <span class="st">"FCGR3A+ Monocytes"</span>, <span class="st">"NK cells"</span>, <span class="st">"Dendritic cells"</span>, <span class="st">"Megakaryocytes"</span>)</a> +<a class="sourceLine" id="cb898-3" data-line-number="3">pbmc<span class="op">@</span>active.ident <-<span class="st"> </span>plyr<span class="op">::</span><span class="kw">mapvalues</span>(<span class="dt">x =</span> pbmc<span class="op">@</span>active.ident, <span class="dt">from =</span> current.cluster.ids, <span class="dt">to =</span> 
new.cluster.ids)</a> +<a class="sourceLine" id="cb898-4" data-line-number="4"><span class="kw">DimPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"tsne"</span>, <span class="dt">do.label =</span> <span class="ot">TRUE</span>, <span class="dt">pt.size =</span> <span class="fl">0.5</span>)</a></code></pre></div> </div> <div id="further-subdivisions-within-cell-types" class="section level2"> <h2><span class="header-section-number">16.14</span> Further subdivisions within cell types</h2> @@ -867,30 +867,30 @@ subdivide into two groups. You can explore this subdivision to find markers separating the two T cell subsets. However, before reclustering (which will overwrite <code>object@ident</code>), we can stash our renamed identities to be easily recovered later.</p> -<div class="sourceCode" id="cb816"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb816-1" data-line-number="1"><span class="co"># First lets stash our identities for later</span></a> -<a class="sourceLine" id="cb816-2" data-line-number="2">pbmc <-<span class="st"> </span><span class="kw">StashIdent</span>(<span class="dt">object =</span> pbmc, <span class="dt">save.name =</span> <span class="st">"ClusterNames_0.6"</span>)</a> -<a class="sourceLine" id="cb816-3" data-line-number="3"></a> -<a class="sourceLine" id="cb816-4" data-line-number="4"><span class="co"># Note that if you set save.snn=T above, you don't need to recalculate the</span></a> -<a class="sourceLine" id="cb816-5" data-line-number="5"><span class="co"># SNN, and can simply put: pbmc <- FindClusters(pbmc,resolution = 0.8)</span></a> -<a class="sourceLine" id="cb816-6" data-line-number="6">pbmc <-<span class="st"> </span><span class="kw">FindClusters</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction.type =</span> <span class="st">"pca"</span>, <span class="dt">dims.use =</span> <span class="dv">1</span><span class="op">:</span><span 
class="dv">10</span>, <span class="dt">resolution =</span> <span class="fl">0.8</span>, <span class="dt">print.output =</span> <span class="ot">FALSE</span>)</a></code></pre></div> -<div class="sourceCode" id="cb817"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb817-1" data-line-number="1"><span class="co"># Demonstration of how to plot two tSNE plots side by side, and how to color</span></a> -<a class="sourceLine" id="cb817-2" data-line-number="2"><span class="co"># points based on different criteria</span></a> -<a class="sourceLine" id="cb817-3" data-line-number="3">plot1 <-<span class="st"> </span><span class="kw">DimPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"tsne"</span>, <span class="dt">do.return =</span> <span class="ot">TRUE</span>, <span class="dt">no.legend =</span> <span class="ot">TRUE</span>, <span class="dt">do.label =</span> <span class="ot">TRUE</span>)</a> -<a class="sourceLine" id="cb817-4" data-line-number="4">plot2 <-<span class="st"> </span><span class="kw">DimPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"tsne"</span>, <span class="dt">do.return =</span> <span class="ot">TRUE</span>, <span class="dt">group.by =</span> <span class="st">"ClusterNames_0.6"</span>, <span class="dt">no.legend =</span> <span class="ot">TRUE</span>, <span class="dt">do.label =</span> <span class="ot">TRUE</span>)</a> -<a class="sourceLine" id="cb817-5" data-line-number="5"><span class="kw">plot_grid</span>(plot1, plot2)</a></code></pre></div> -<div class="sourceCode" id="cb818"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb818-1" data-line-number="1"><span class="co"># Find discriminating markers</span></a> -<a class="sourceLine" id="cb818-2" data-line-number="2">tcell.markers <-<span class="st"> </span><span class="kw">FindMarkers</span>(<span class="dt">object =</span> pbmc, 
<span class="dt">ident.1 =</span> <span class="dv">0</span>, <span class="dt">ident.2 =</span> <span class="dv">1</span>)</a> -<a class="sourceLine" id="cb818-3" data-line-number="3"></a> -<a class="sourceLine" id="cb818-4" data-line-number="4"><span class="co"># Most of the markers tend to be expressed in C1 (i.e. S100A4). However, we</span></a> -<a class="sourceLine" id="cb818-5" data-line-number="5"><span class="co"># can see that CCR7 is upregulated in C0, strongly indicating that we can</span></a> -<a class="sourceLine" id="cb818-6" data-line-number="6"><span class="co"># differentiate memory from naive CD4 cells. cols.use demarcates the color</span></a> -<a class="sourceLine" id="cb818-7" data-line-number="7"><span class="co"># palette from low to high expression</span></a> -<a class="sourceLine" id="cb818-8" data-line-number="8"><span class="kw">FeaturePlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span> <span class="kw">c</span>(<span class="st">"S100A4"</span>, <span class="st">"CCR7"</span>), <span class="dt">cols =</span> <span class="kw">c</span>(<span class="st">"green"</span>, <span class="st">"blue"</span>))</a></code></pre></div> +<div class="sourceCode" id="cb899"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb899-1" data-line-number="1"><span class="co"># First lets stash our identities for later</span></a> +<a class="sourceLine" id="cb899-2" data-line-number="2">pbmc <-<span class="st"> </span><span class="kw">StashIdent</span>(<span class="dt">object =</span> pbmc, <span class="dt">save.name =</span> <span class="st">"ClusterNames_0.6"</span>)</a> +<a class="sourceLine" id="cb899-3" data-line-number="3"></a> +<a class="sourceLine" id="cb899-4" data-line-number="4"><span class="co"># Note that if you set save.snn=T above, you don't need to recalculate the</span></a> +<a class="sourceLine" id="cb899-5" data-line-number="5"><span class="co"># SNN, and can simply put: pbmc <- 
FindClusters(pbmc,resolution = 0.8)</span></a> +<a class="sourceLine" id="cb899-6" data-line-number="6">pbmc <-<span class="st"> </span><span class="kw">FindClusters</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction.type =</span> <span class="st">"pca"</span>, <span class="dt">dims.use =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">10</span>, <span class="dt">resolution =</span> <span class="fl">0.8</span>, <span class="dt">print.output =</span> <span class="ot">FALSE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb900"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb900-1" data-line-number="1"><span class="co"># Demonstration of how to plot two tSNE plots side by side, and how to color</span></a> +<a class="sourceLine" id="cb900-2" data-line-number="2"><span class="co"># points based on different criteria</span></a> +<a class="sourceLine" id="cb900-3" data-line-number="3">plot1 <-<span class="st"> </span><span class="kw">DimPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"tsne"</span>, <span class="dt">do.return =</span> <span class="ot">TRUE</span>, <span class="dt">no.legend =</span> <span class="ot">TRUE</span>, <span class="dt">do.label =</span> <span class="ot">TRUE</span>)</a> +<a class="sourceLine" id="cb900-4" data-line-number="4">plot2 <-<span class="st"> </span><span class="kw">DimPlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">reduction =</span> <span class="st">"tsne"</span>, <span class="dt">do.return =</span> <span class="ot">TRUE</span>, <span class="dt">group.by =</span> <span class="st">"ClusterNames_0.6"</span>, <span class="dt">no.legend =</span> <span class="ot">TRUE</span>, <span class="dt">do.label =</span> <span class="ot">TRUE</span>)</a> +<a class="sourceLine" id="cb900-5" data-line-number="5"><span class="kw">plot_grid</span>(plot1, plot2)</a></code></pre></div> 
+<div class="sourceCode" id="cb901"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb901-1" data-line-number="1"><span class="co"># Find discriminating markers</span></a> +<a class="sourceLine" id="cb901-2" data-line-number="2">tcell.markers <-<span class="st"> </span><span class="kw">FindMarkers</span>(<span class="dt">object =</span> pbmc, <span class="dt">ident.1 =</span> <span class="dv">0</span>, <span class="dt">ident.2 =</span> <span class="dv">1</span>)</a> +<a class="sourceLine" id="cb901-3" data-line-number="3"></a> +<a class="sourceLine" id="cb901-4" data-line-number="4"><span class="co"># Most of the markers tend to be expressed in C1 (i.e. S100A4). However, we</span></a> +<a class="sourceLine" id="cb901-5" data-line-number="5"><span class="co"># can see that CCR7 is upregulated in C0, strongly indicating that we can</span></a> +<a class="sourceLine" id="cb901-6" data-line-number="6"><span class="co"># differentiate memory from naive CD4 cells. cols.use demarcates the color</span></a> +<a class="sourceLine" id="cb901-7" data-line-number="7"><span class="co"># palette from low to high expression</span></a> +<a class="sourceLine" id="cb901-8" data-line-number="8"><span class="kw">FeaturePlot</span>(<span class="dt">object =</span> pbmc, <span class="dt">features =</span> <span class="kw">c</span>(<span class="st">"S100A4"</span>, <span class="st">"CCR7"</span>), <span class="dt">cols =</span> <span class="kw">c</span>(<span class="st">"green"</span>, <span class="st">"blue"</span>))</a></code></pre></div> <p>The memory/naive split is bit weak, and we would probably benefit from looking at more cells to see if this becomes more convincing. 
In the meantime, we can restore our old cluster identities for downstream processing.</p> -<div class="sourceCode" id="cb819"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb819-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">SetIdent</span>(<span class="dt">object =</span> pbmc, <span class="dt">value =</span> <span class="st">"ClusterNames_0.6"</span>)</a> -<a class="sourceLine" id="cb819-2" data-line-number="2"><span class="kw">saveRDS</span>(pbmc, <span class="dt">file =</span> <span class="st">"data/pbmc3k_final.rds"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb902"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb902-1" data-line-number="1">pbmc <-<span class="st"> </span><span class="kw">SetIdent</span>(<span class="dt">object =</span> pbmc, <span class="dt">value =</span> <span class="st">"ClusterNames_0.6"</span>)</a> +<a class="sourceLine" id="cb902-2" data-line-number="2"><span class="kw">saveRDS</span>(pbmc, <span class="dt">file =</span> <span class="st">"data/pbmc3k_final.rds"</span>)</a></code></pre></div> </div> <div id="sessioninfo-12" class="section level2"> <h2><span class="header-section-number">16.15</span> sessionInfo()</h2> diff --git a/public/trajectory-inference.html b/public/trajectory-inference.html index 1712f9f7b3799cf6ddcbe0e38895757ba7e87302..21b02c6bbed1e3c173bb7a65aaa5393ad2fb2b74 100644 --- a/public/trajectory-inference.html +++ b/public/trajectory-inference.html @@ -339,7 +339,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.4.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.4.6</b> sessionInfo()</a></li> </ul></li> <li class="chapter" data-level="7.5" data-path="normalization-confounders-and-batch-correction.html"><a 
href="normalization-confounders-and-batch-correction.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors (Reads)</a></li> -<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.6</b> Dealing with confounders</a><ul> +<li class="chapter" data-level="7.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#batch-effects"><i class="fa fa-check"></i><b>7.6</b> Batch effects</a><ul> <li class="chapter" data-level="7.6.1" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#introduction-6"><i class="fa fa-check"></i><b>7.6.1</b> Introduction</a></li> <li class="chapter" data-level="7.6.2" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#linear-models"><i class="fa fa-check"></i><b>7.6.2</b> Linear models</a></li> <li class="chapter" data-level="7.6.3" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sctransform-2"><i class="fa fa-check"></i><b>7.6.3</b> sctransform</a></li> @@ -347,7 +347,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="7.6.5" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#combat"><i class="fa fa-check"></i><b>7.6.5</b> Combat</a></li> <li class="chapter" data-level="7.6.6" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#mnncorrect"><i class="fa fa-check"></i><b>7.6.6</b> mnnCorrect</a></li> <li class="chapter" 
data-level="7.6.7" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#harmony"><i class="fa fa-check"></i><b>7.6.7</b> Harmony</a></li> -<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare confounder removal strategies</a></li> +<li class="chapter" data-level="7.6.8" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#how-to-evaluate-and-compare-batch-correction"><i class="fa fa-check"></i><b>7.6.8</b> How to evaluate and compare batch correction</a></li> <li class="chapter" data-level="7.6.9" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#big-exercise-2"><i class="fa fa-check"></i><b>7.6.9</b> Big Exercise</a></li> <li class="chapter" data-level="7.6.10" data-path="normalization-confounders-and-batch-correction.html"><a href="normalization-confounders-and-batch-correction.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.6.10</b> sessionInfo()</a></li> </ul></li> @@ -370,14 +370,13 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="9.1.2" data-path="latent-spaces.html"><a href="latent-spaces.html#tsne-t-distributed-stochastic-neighbor-embedding"><i class="fa fa-check"></i><b>9.1.2</b> tSNE: t-Distributed Stochastic Neighbor Embedding</a></li> <li class="chapter" data-level="9.1.3" data-path="latent-spaces.html"><a href="latent-spaces.html#manifold-methods"><i class="fa fa-check"></i><b>9.1.3</b> Manifold methods</a></li> </ul></li> -<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a 
href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a></li> +<li class="chapter" data-level="9.2" data-path="latent-spaces.html"><a href="latent-spaces.html#matrix-factorization-and-factor-analysis"><i class="fa fa-check"></i><b>9.2</b> Matrix factorization and factor analysis</a><ul> +<li class="chapter" data-level="9.2.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom-interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.2.1</b> <span>Slalom</span>: Interpretable latent spaces</a></li> +</ul></li> <li class="chapter" data-level="9.3" data-path="latent-spaces.html"><a href="latent-spaces.html#autoencoders"><i class="fa fa-check"></i><b>9.3</b> Autoencoders</a><ul> <li class="chapter" data-level="9.3.1" data-path="latent-spaces.html"><a href="latent-spaces.html#background-and-some-notations"><i class="fa fa-check"></i><b>9.3.1</b> Background and some notations</a></li> <li class="chapter" data-level="9.3.2" data-path="latent-spaces.html"><a href="latent-spaces.html#objective"><i class="fa fa-check"></i><b>9.3.2</b> Objective</a></li> </ul></li> -<li class="chapter" data-level="9.4" data-path="latent-spaces.html"><a href="latent-spaces.html#interpretable-latent-spaces"><i class="fa fa-check"></i><b>9.4</b> Interpretable latent spaces</a><ul> -<li class="chapter" data-level="9.4.1" data-path="latent-spaces.html"><a href="latent-spaces.html#slalom"><i class="fa fa-check"></i><b>9.4.1</b> Slalom</a></li> -</ul></li> </ul></li> <li class="chapter" data-level="10" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html"><i class="fa fa-check"></i><b>10</b> Clustering and cell annotation</a><ul> <li class="chapter" data-level="10.1" data-path="clustering-and-cell-annotation.html"><a href="clustering-and-cell-annotation.html#clustering-methods"><i class="fa fa-check"></i><b>10.1</b> Clustering Methods</a><ul> @@ 
-399,15 +398,16 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <li class="chapter" data-level="11.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>11.1</b> First look at Deng data</a><ul> <li class="chapter" data-level="11.1.1" data-path="trajectory-inference.html"><a href="trajectory-inference.html#tscan"><i class="fa fa-check"></i><b>11.1.1</b> TSCAN</a></li> <li class="chapter" data-level="11.1.2" data-path="trajectory-inference.html"><a href="trajectory-inference.html#slingshot"><i class="fa fa-check"></i><b>11.1.2</b> Slingshot</a></li> -<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.3</b> Monocle</a></li> -<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.4</b> Monocle 2</a></li> -<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 3</a></li> -<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.6</b> Diffusion maps</a></li> -<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.7</b> Other methods</a></li> -<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.8</b> Comparison of the methods</a></li> -<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa 
fa-check"></i><b>11.1.9</b> Expression of genes through time</a></li> -<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.10</b> dynverse</a></li> -<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.11</b> sessionInfo()</a></li> +<li class="chapter" data-level="11.1.3" data-path="trajectory-inference.html"><a href="trajectory-inference.html#gam-general-additive-model-for-identifying-temporally-expressed-genes"><i class="fa fa-check"></i><b>11.1.3</b> GAM general additive model for identifying temporally expressed genes</a></li> +<li class="chapter" data-level="11.1.4" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle"><i class="fa fa-check"></i><b>11.1.4</b> Monocle</a></li> +<li class="chapter" data-level="11.1.5" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-2"><i class="fa fa-check"></i><b>11.1.5</b> Monocle 2</a></li> +<li class="chapter" data-level="11.1.6" data-path="trajectory-inference.html"><a href="trajectory-inference.html#monocle-3"><i class="fa fa-check"></i><b>11.1.6</b> Monocle 3</a></li> +<li class="chapter" data-level="11.1.7" data-path="trajectory-inference.html"><a href="trajectory-inference.html#diffusion-maps"><i class="fa fa-check"></i><b>11.1.7</b> Diffusion maps</a></li> +<li class="chapter" data-level="11.1.8" data-path="trajectory-inference.html"><a href="trajectory-inference.html#other-methods"><i class="fa fa-check"></i><b>11.1.8</b> Other methods</a></li> +<li class="chapter" data-level="11.1.9" data-path="trajectory-inference.html"><a href="trajectory-inference.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>11.1.9</b> Comparison of the methods</a></li> +<li class="chapter" data-level="11.1.10" data-path="trajectory-inference.html"><a 
href="trajectory-inference.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>11.1.10</b> Expression of genes through time</a></li> +<li class="chapter" data-level="11.1.11" data-path="trajectory-inference.html"><a href="trajectory-inference.html#dynverse"><i class="fa fa-check"></i><b>11.1.11</b> dynverse</a></li> +<li class="chapter" data-level="11.1.12" data-path="trajectory-inference.html"><a href="trajectory-inference.html#sessioninfo-7"><i class="fa fa-check"></i><b>11.1.12</b> sessionInfo()</a></li> </ul></li> </ul></li> <li class="chapter" data-level="12" data-path="dechapter.html"><a href="dechapter.html"><i class="fa fa-check"></i><b>12</b> Differential Expression (DE) analysis</a><ul> @@ -506,21 +506,21 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warni <section class="normal" id="section-"> <div id="trajectory-inference" class="section level1"> <h1><span class="header-section-number">11</span> Trajectory inference</h1> -<div class="sourceCode" id="cb641"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb641-1" data-line-number="1"><span class="kw">library</span>(SingleCellExperiment)</a> -<a class="sourceLine" id="cb641-2" data-line-number="2"><span class="kw">library</span>(TSCAN)</a> -<a class="sourceLine" id="cb641-3" data-line-number="3"><span class="kw">library</span>(M3Drop)</a> -<a class="sourceLine" id="cb641-4" data-line-number="4"><span class="kw">library</span>(monocle)</a> -<a class="sourceLine" id="cb641-5" data-line-number="5"><span class="kw">library</span>(destiny)</a> -<a class="sourceLine" id="cb641-6" data-line-number="6"><span class="kw">library</span>(scater)</a> -<a class="sourceLine" id="cb641-7" data-line-number="7"><span class="kw">library</span>(ggplot2)</a> -<a class="sourceLine" id="cb641-8" data-line-number="8"><span class="kw">library</span>(ggthemes)</a> -<a class="sourceLine" id="cb641-9" data-line-number="9"><span 
class="kw">library</span>(ggbeeswarm)</a> -<a class="sourceLine" id="cb641-10" data-line-number="10"><span class="kw">library</span>(corrplot)</a> -<a class="sourceLine" id="cb641-11" data-line-number="11"><span class="kw">library</span>(Polychrome)</a> -<a class="sourceLine" id="cb641-12" data-line-number="12"><span class="kw">library</span>(slingshot)</a> -<a class="sourceLine" id="cb641-13" data-line-number="13"><span class="kw">library</span>(SLICER)</a> -<a class="sourceLine" id="cb641-14" data-line-number="14"><span class="kw">library</span>(ouija)</a> -<a class="sourceLine" id="cb641-15" data-line-number="15"><span class="kw">set.seed</span>(<span class="dv">1</span>)</a></code></pre></div> +<div class="sourceCode" id="cb723"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb723-1" data-line-number="1"><span class="kw">library</span>(SingleCellExperiment)</a> +<a class="sourceLine" id="cb723-2" data-line-number="2"><span class="kw">library</span>(TSCAN)</a> +<a class="sourceLine" id="cb723-3" data-line-number="3"><span class="kw">library</span>(M3Drop)</a> +<a class="sourceLine" id="cb723-4" data-line-number="4"><span class="kw">library</span>(monocle)</a> +<a class="sourceLine" id="cb723-5" data-line-number="5"><span class="kw">library</span>(destiny)</a> +<a class="sourceLine" id="cb723-6" data-line-number="6"><span class="kw">library</span>(scater)</a> +<a class="sourceLine" id="cb723-7" data-line-number="7"><span class="kw">library</span>(ggplot2)</a> +<a class="sourceLine" id="cb723-8" data-line-number="8"><span class="kw">library</span>(ggthemes)</a> +<a class="sourceLine" id="cb723-9" data-line-number="9"><span class="kw">library</span>(ggbeeswarm)</a> +<a class="sourceLine" id="cb723-10" data-line-number="10"><span class="kw">library</span>(corrplot)</a> +<a class="sourceLine" id="cb723-11" data-line-number="11"><span class="kw">library</span>(Polychrome)</a> +<a class="sourceLine" id="cb723-12" 
data-line-number="12"><span class="kw">library</span>(slingshot)</a> +<a class="sourceLine" id="cb723-13" data-line-number="13"><span class="kw">library</span>(SLICER)</a> +<a class="sourceLine" id="cb723-14" data-line-number="14"><span class="kw">library</span>(ouija)</a> +<a class="sourceLine" id="cb723-15" data-line-number="15"><span class="kw">set.seed</span>(<span class="dv">1</span>)</a></code></pre></div> <p>In many situations, one is studying a process where cells change continuously. This includes, for example, many differentiation processes taking place during development: following a stimulus, cells @@ -572,42 +572,42 @@ Figure 2.5: Detailed results of the four main evaluation criteria: accuracy, sca <div id="first-look-at-deng-data" class="section level2"> <h2><span class="header-section-number">11.1</span> First look at Deng data</h2> <p>Let us take a first look at the Deng<span class="citation">(Deng et al. <a href="#ref-Deng2014-mx">2014</a>)</span> data, without yet applying sophisticated pseudotime methods. As the plot below shows, simple PCA does a very good job of displaying the structure in these data. 
It is only once we reach the blast cell types (“earlyblastâ€, “midblastâ€, “lateblastâ€) that PCA struggles to separate the distinct cell types.</p> -<div class="sourceCode" id="cb642"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb642-1" data-line-number="1">deng_SCE <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/deng/deng-reads.rds"</span>)</a> -<a class="sourceLine" id="cb642-2" data-line-number="2"></a> -<a class="sourceLine" id="cb642-3" data-line-number="3">deng_SCE<span class="op">$</span>cell_type2 <-<span class="st"> </span><span class="kw">factor</span>(</a> -<a class="sourceLine" id="cb642-4" data-line-number="4"> deng_SCE<span class="op">$</span>cell_type2,</a> -<a class="sourceLine" id="cb642-5" data-line-number="5"> <span class="dt">levels =</span> <span class="kw">c</span>(<span class="st">"zy"</span>, <span class="st">"early2cell"</span>, <span class="st">"mid2cell"</span>, <span class="st">"late2cell"</span>,</a> -<a class="sourceLine" id="cb642-6" data-line-number="6"> <span class="st">"4cell"</span>, <span class="st">"8cell"</span>, <span class="st">"16cell"</span>, <span class="st">"earlyblast"</span>,</a> -<a class="sourceLine" id="cb642-7" data-line-number="7"> <span class="st">"midblast"</span>, <span class="st">"lateblast"</span>)</a> -<a class="sourceLine" id="cb642-8" data-line-number="8">)</a> -<a class="sourceLine" id="cb642-9" data-line-number="9">cellLabels <-<span class="st"> </span>deng_SCE<span class="op">$</span>cell_type2</a> -<a class="sourceLine" id="cb642-10" data-line-number="10">deng <-<span class="st"> </span><span class="kw">counts</span>(deng_SCE)</a> -<a class="sourceLine" id="cb642-11" data-line-number="11"><span class="kw">colnames</span>(deng) <-<span class="st"> </span>cellLabels</a> -<a class="sourceLine" id="cb642-12" data-line-number="12"></a> -<a class="sourceLine" id="cb642-13" data-line-number="13">deng_SCE <-<span class="st"> 
</span>scater<span class="op">::</span><span class="kw">runPCA</span>(deng_SCE,<span class="dt">ncomponent =</span> <span class="dv">5</span>)</a> -<a class="sourceLine" id="cb642-14" data-line-number="14"></a> -<a class="sourceLine" id="cb642-15" data-line-number="15"><span class="co">## change color Palette with library(Polychrome)</span></a> -<a class="sourceLine" id="cb642-16" data-line-number="16"></a> -<a class="sourceLine" id="cb642-17" data-line-number="17"><span class="kw">set.seed</span>(<span class="dv">723451</span>) <span class="co"># for reproducibility</span></a> -<a class="sourceLine" id="cb642-18" data-line-number="18">my_color <-<span class="st"> </span><span class="kw">createPalette</span>(<span class="dv">10</span>, <span class="kw">c</span>(<span class="st">"#010101"</span>, <span class="st">"#ff0000"</span>), <span class="dt">M=</span><span class="dv">1000</span>)</a> -<a class="sourceLine" id="cb642-19" data-line-number="19"><span class="kw">names</span>(my_color) <-<span class="st"> </span><span class="kw">unique</span>(<span class="kw">as.character</span>(deng_SCE<span class="op">$</span>cell_type2))</a> -<a class="sourceLine" id="cb642-20" data-line-number="20"></a> -<a class="sourceLine" id="cb642-21" data-line-number="21">pca_df <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">PC1 =</span> <span class="kw">reducedDim</span>(deng_SCE,<span class="st">"PCA"</span>)[,<span class="dv">1</span>],</a> -<a class="sourceLine" id="cb642-22" data-line-number="22"> <span class="dt">PC2 =</span> <span class="kw">reducedDim</span>(deng_SCE,<span class="st">"PCA"</span>)[,<span class="dv">2</span>],</a> -<a class="sourceLine" id="cb642-23" data-line-number="23"> <span class="dt">cell_type2 =</span> deng_SCE<span class="op">$</span>cell_type2)</a> -<a class="sourceLine" id="cb642-24" data-line-number="24"></a> -<a class="sourceLine" id="cb642-25" data-line-number="25"><span class="kw">ggplot</span>(<span class="dt">data 
=</span> pca_df)<span class="op">+</span><span class="kw">geom_point</span>(<span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> PC1, <span class="dt">y =</span> PC2, <span class="dt">colour =</span> cell_type2))<span class="op">+</span></a> -<a class="sourceLine" id="cb642-26" data-line-number="26"><span class="st"> </span><span class="kw">scale_colour_manual</span>(<span class="dt">values =</span> my_color)<span class="op">+</span><span class="kw">theme_classic</span>()</a></code></pre></div> +<div class="sourceCode" id="cb724"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb724-1" data-line-number="1">deng_SCE <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"data/deng/deng-reads.rds"</span>)</a> +<a class="sourceLine" id="cb724-2" data-line-number="2"></a> +<a class="sourceLine" id="cb724-3" data-line-number="3">deng_SCE<span class="op">$</span>cell_type2 <-<span class="st"> </span><span class="kw">factor</span>(</a> +<a class="sourceLine" id="cb724-4" data-line-number="4"> deng_SCE<span class="op">$</span>cell_type2,</a> +<a class="sourceLine" id="cb724-5" data-line-number="5"> <span class="dt">levels =</span> <span class="kw">c</span>(<span class="st">"zy"</span>, <span class="st">"early2cell"</span>, <span class="st">"mid2cell"</span>, <span class="st">"late2cell"</span>,</a> +<a class="sourceLine" id="cb724-6" data-line-number="6"> <span class="st">"4cell"</span>, <span class="st">"8cell"</span>, <span class="st">"16cell"</span>, <span class="st">"earlyblast"</span>,</a> +<a class="sourceLine" id="cb724-7" data-line-number="7"> <span class="st">"midblast"</span>, <span class="st">"lateblast"</span>)</a> +<a class="sourceLine" id="cb724-8" data-line-number="8">)</a> +<a class="sourceLine" id="cb724-9" data-line-number="9">cellLabels <-<span class="st"> </span>deng_SCE<span class="op">$</span>cell_type2</a> +<a class="sourceLine" id="cb724-10" 
data-line-number="10">deng <-<span class="st"> </span><span class="kw">counts</span>(deng_SCE)</a> +<a class="sourceLine" id="cb724-11" data-line-number="11"><span class="kw">colnames</span>(deng) <-<span class="st"> </span>cellLabels</a> +<a class="sourceLine" id="cb724-12" data-line-number="12"></a> +<a class="sourceLine" id="cb724-13" data-line-number="13">deng_SCE <-<span class="st"> </span>scater<span class="op">::</span><span class="kw">runPCA</span>(deng_SCE,<span class="dt">ncomponent =</span> <span class="dv">5</span>)</a> +<a class="sourceLine" id="cb724-14" data-line-number="14"></a> +<a class="sourceLine" id="cb724-15" data-line-number="15"><span class="co">## change color Palette with library(Polychrome)</span></a> +<a class="sourceLine" id="cb724-16" data-line-number="16"></a> +<a class="sourceLine" id="cb724-17" data-line-number="17"><span class="kw">set.seed</span>(<span class="dv">723451</span>) <span class="co"># for reproducibility</span></a> +<a class="sourceLine" id="cb724-18" data-line-number="18">my_color <-<span class="st"> </span><span class="kw">createPalette</span>(<span class="dv">10</span>, <span class="kw">c</span>(<span class="st">"#010101"</span>, <span class="st">"#ff0000"</span>), <span class="dt">M=</span><span class="dv">1000</span>)</a> +<a class="sourceLine" id="cb724-19" data-line-number="19"><span class="kw">names</span>(my_color) <-<span class="st"> </span><span class="kw">unique</span>(<span class="kw">as.character</span>(deng_SCE<span class="op">$</span>cell_type2))</a> +<a class="sourceLine" id="cb724-20" data-line-number="20"></a> +<a class="sourceLine" id="cb724-21" data-line-number="21">pca_df <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">PC1 =</span> <span class="kw">reducedDim</span>(deng_SCE,<span class="st">"PCA"</span>)[,<span class="dv">1</span>],</a> +<a class="sourceLine" id="cb724-22" data-line-number="22"> <span class="dt">PC2 =</span> <span 
class="kw">reducedDim</span>(deng_SCE,<span class="st">"PCA"</span>)[,<span class="dv">2</span>],</a> +<a class="sourceLine" id="cb724-23" data-line-number="23"> <span class="dt">cell_type2 =</span> deng_SCE<span class="op">$</span>cell_type2)</a> +<a class="sourceLine" id="cb724-24" data-line-number="24"></a> +<a class="sourceLine" id="cb724-25" data-line-number="25"><span class="kw">ggplot</span>(<span class="dt">data =</span> pca_df)<span class="op">+</span><span class="kw">geom_point</span>(<span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> PC1, <span class="dt">y =</span> PC2, <span class="dt">colour =</span> cell_type2))<span class="op">+</span></a> +<a class="sourceLine" id="cb724-26" data-line-number="26"><span class="st"> </span><span class="kw">scale_colour_manual</span>(<span class="dt">values =</span> my_color)<span class="op">+</span><span class="kw">theme_classic</span>()</a></code></pre></div> <p><img src="pseudotime_files/figure-html/data-overview-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>PCA, here, provides a useful baseline for assessing different pseudotime methods. 
For a very naive pseudotime we can just take the co-ordinates of the first principal component.</p> -<div class="sourceCode" id="cb643"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb643-1" data-line-number="1"><span class="co">#deng_SCE$PC1 <- reducedDim(deng_SCE, "PCA")[,1]</span></a> -<a class="sourceLine" id="cb643-2" data-line-number="2"></a> -<a class="sourceLine" id="cb643-3" data-line-number="3"><span class="kw">ggplot</span>(pca_df, <span class="kw">aes</span>(<span class="dt">x =</span> PC1, <span class="dt">y =</span> cell_type2, </a> -<a class="sourceLine" id="cb643-4" data-line-number="4"> <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> -<a class="sourceLine" id="cb643-5" data-line-number="5"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb643-6" data-line-number="6"><span class="st"> </span><span class="kw">scale_colour_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb643-7" data-line-number="7"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"First principal component"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb643-8" data-line-number="8"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by first principal component"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb725"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb725-1" data-line-number="1"><span class="co">#deng_SCE$PC1 <- reducedDim(deng_SCE, "PCA")[,1]</span></a> +<a class="sourceLine" 
id="cb725-2" data-line-number="2"></a> +<a class="sourceLine" id="cb725-3" data-line-number="3"><span class="kw">ggplot</span>(pca_df, <span class="kw">aes</span>(<span class="dt">x =</span> PC1, <span class="dt">y =</span> cell_type2, </a> +<a class="sourceLine" id="cb725-4" data-line-number="4"> <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> +<a class="sourceLine" id="cb725-5" data-line-number="5"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb725-6" data-line-number="6"><span class="st"> </span><span class="kw">scale_colour_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb725-7" data-line-number="7"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"First principal component"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb725-8" data-line-number="8"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by first principal component"</span>)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/pca-pseudotime-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>As the plot above shows, PC1 struggles to correctly order cells early and late in the developmental timecourse, but overall does a relatively good job of ordering cells by developmental time.</p> <p>Can bespoke pseudotime methods do better than naive application of PCA?</p> @@ -616,32 +616,32 @@ Figure 2.5: Detailed results of the four main evaluation criteria: accuracy, sca <p>TSCAN <span class="citation">(Ji and Ji <a href="#ref-tscam_rpkg">2019</a>)</span> combines 
clustering with pseudotime analysis. First it clusters the cells using <code>mclust</code>, which is based on a mixture of normal distributions. Then it builds a minimum spanning tree to connect the clusters. The branch of this tree that connects the largest number of clusters is the main branch which is used to determine pseudotime.</p> <p><strong>Note</strong> From a connected graph with weighted edges, MST is the tree structure that connects all the nodes in a way that has the minimum total edge weight. The trajectory inference methods that use MST is based on the idea that nodes (cells/clusters of cells) and their connections represent the geometric shape of the data cloud in a two-dimenension space.</p> <p>First we will try to use all genes to order the cells.</p> -<div class="sourceCode" id="cb644"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb644-1" data-line-number="1">procdeng <-<span class="st"> </span>TSCAN<span class="op">::</span><span class="kw">preprocess</span>(<span class="kw">counts</span>(deng_SCE))</a> -<a class="sourceLine" id="cb644-2" data-line-number="2"></a> -<a class="sourceLine" id="cb644-3" data-line-number="3"><span class="kw">colnames</span>(procdeng) <-<span class="st"> </span><span class="dv">1</span><span class="op">:</span><span class="kw">ncol</span>(deng_SCE)</a> -<a class="sourceLine" id="cb644-4" data-line-number="4"></a> -<a class="sourceLine" id="cb644-5" data-line-number="5">dengclust <-<span class="st"> </span>TSCAN<span class="op">::</span><span class="kw">exprmclust</span>(procdeng, <span class="dt">clusternum =</span> <span class="dv">10</span>)</a> -<a class="sourceLine" id="cb644-6" data-line-number="6"></a> -<a class="sourceLine" id="cb644-7" data-line-number="7">TSCAN<span class="op">::</span><span class="kw">plotmclust</span>(dengclust)</a></code></pre></div> +<div class="sourceCode" id="cb726"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb726-1" 
data-line-number="1">procdeng <-<span class="st"> </span>TSCAN<span class="op">::</span><span class="kw">preprocess</span>(<span class="kw">counts</span>(deng_SCE))</a> +<a class="sourceLine" id="cb726-2" data-line-number="2"></a> +<a class="sourceLine" id="cb726-3" data-line-number="3"><span class="kw">colnames</span>(procdeng) <-<span class="st"> </span><span class="dv">1</span><span class="op">:</span><span class="kw">ncol</span>(deng_SCE)</a> +<a class="sourceLine" id="cb726-4" data-line-number="4"></a> +<a class="sourceLine" id="cb726-5" data-line-number="5">dengclust <-<span class="st"> </span>TSCAN<span class="op">::</span><span class="kw">exprmclust</span>(procdeng, <span class="dt">clusternum =</span> <span class="dv">10</span>)</a> +<a class="sourceLine" id="cb726-6" data-line-number="6"></a> +<a class="sourceLine" id="cb726-7" data-line-number="7">TSCAN<span class="op">::</span><span class="kw">plotmclust</span>(dengclust)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/tscan-all-genes-1.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb645"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb645-1" data-line-number="1">dengorderTSCAN <-<span class="st"> </span>TSCAN<span class="op">::</span><span class="kw">TSCANorder</span>(dengclust, <span class="dt">orderonly =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb645-2" data-line-number="2">pseudotime_order_tscan <-<span class="st"> </span><span class="kw">as.character</span>(dengorderTSCAN<span class="op">$</span>sample_name)</a> -<a class="sourceLine" id="cb645-3" data-line-number="3">deng_SCE<span class="op">$</span>pseudotime_order_tscan <-<span class="st"> </span><span class="ot">NA</span></a> -<a class="sourceLine" id="cb645-4" data-line-number="4">deng_SCE<span class="op">$</span>pseudotime_order_tscan[<span class="kw">as.numeric</span>(dengorderTSCAN<span class="op">$</span>sample_name)] 
<-<span class="st"> </span></a> -<a class="sourceLine" id="cb645-5" data-line-number="5"><span class="st"> </span>dengorderTSCAN<span class="op">$</span>Pseudotime</a></code></pre></div> +<div class="sourceCode" id="cb727"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb727-1" data-line-number="1">dengorderTSCAN <-<span class="st"> </span>TSCAN<span class="op">::</span><span class="kw">TSCANorder</span>(dengclust, <span class="dt">orderonly =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb727-2" data-line-number="2">pseudotime_order_tscan <-<span class="st"> </span><span class="kw">as.character</span>(dengorderTSCAN<span class="op">$</span>sample_name)</a> +<a class="sourceLine" id="cb727-3" data-line-number="3">deng_SCE<span class="op">$</span>pseudotime_order_tscan <-<span class="st"> </span><span class="ot">NA</span></a> +<a class="sourceLine" id="cb727-4" data-line-number="4">deng_SCE<span class="op">$</span>pseudotime_order_tscan[<span class="kw">as.numeric</span>(dengorderTSCAN<span class="op">$</span>sample_name)] <-<span class="st"> </span></a> +<a class="sourceLine" id="cb727-5" data-line-number="5"><span class="st"> </span>dengorderTSCAN<span class="op">$</span>Pseudotime</a></code></pre></div> <p>Frustratingly, TSCAN only provides pseudotime values for 221 of 268 cells, silently returning missing values for non-assigned cells.</p> <p>Again, we examine which timepoints have been assigned to each state:</p> -<div class="sourceCode" id="cb646"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb646-1" data-line-number="1">cellLabels[dengclust<span class="op">$</span>clusterid <span class="op">==</span><span class="st"> </span><span class="dv">10</span>]</a></code></pre></div> +<div class="sourceCode" id="cb728"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb728-1" data-line-number="1">cellLabels[dengclust<span class="op">$</span>clusterid 
<span class="op">==</span><span class="st"> </span><span class="dv">10</span>]</a></code></pre></div> <pre><code>## [1] late2cell late2cell late2cell late2cell late2cell late2cell late2cell ## [8] late2cell late2cell late2cell ## 10 Levels: zy early2cell mid2cell late2cell 4cell 8cell ... lateblast</code></pre> -<div class="sourceCode" id="cb648"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb648-1" data-line-number="1"><span class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(<span class="kw">colData</span>(deng_SCE)), </a> -<a class="sourceLine" id="cb648-2" data-line-number="2"> <span class="kw">aes</span>(<span class="dt">x =</span> pseudotime_order_tscan, </a> -<a class="sourceLine" id="cb648-3" data-line-number="3"> <span class="dt">y =</span> cell_type2, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> -<a class="sourceLine" id="cb648-4" data-line-number="4"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb648-5" data-line-number="5"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb648-6" data-line-number="6"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"TSCAN pseudotime"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb648-7" data-line-number="7"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by TSCAN pseudotime"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb730"><pre class="sourceCode r"><code class="sourceCode r"><a 
class="sourceLine" id="cb730-1" data-line-number="1"><span class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(<span class="kw">colData</span>(deng_SCE)), </a> +<a class="sourceLine" id="cb730-2" data-line-number="2"> <span class="kw">aes</span>(<span class="dt">x =</span> pseudotime_order_tscan, </a> +<a class="sourceLine" id="cb730-3" data-line-number="3"> <span class="dt">y =</span> cell_type2, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> +<a class="sourceLine" id="cb730-4" data-line-number="4"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb730-5" data-line-number="5"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb730-6" data-line-number="6"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"TSCAN pseudotime"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb730-7" data-line-number="7"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by TSCAN pseudotime"</span>)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/tscan-vs-truth-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>TSCAN gets the development trajectory the “wrong way aroundâ€, in the sense that later pseudotime values correspond to early timepoints and vice versa. This is not inherently a problem (it is easy enough to reverse the ordering to get the intuitive interpretation of pseudotime), but overall it would be a stretch to suggest that TSCAN performs better than PCA on this dataset. 
(As it is a PCA-based method, perhaps this is not entirely surprising.)</p> <p><strong>Exercise 1</strong> Compare results for different numbers of clusters (<code>clusternum</code>).</p> @@ -653,19 +653,19 @@ Figure 2.5: Detailed results of the four main evaluation criteria: accuracy, sca <p><code>Slingshot</code> had consistently performing well across different datasets as reported by Saelens et al, let’s have a run for the <code>deng</code> dataset. It is recommended by <code>Slingshot</code> to run in a reduced dimensions.</p> <p>__Note_ Principal curves are smooth one-dimensional curves that pass through the middle of a p-dimensional data set, providing a nonlinear summary of the data. They are nonparametric, and their shape is suggested by the data (Hastie et al)<span class="citation">(Hastie and Stuetzle <a href="#ref-Hastie1989-pd">1989</a>)</span>.</p> -<div class="sourceCode" id="cb649"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb649-1" data-line-number="1"><span class="co">## runing slingshot</span></a> -<a class="sourceLine" id="cb649-2" data-line-number="2"></a> -<a class="sourceLine" id="cb649-3" data-line-number="3">deng_SCE <-<span class="st"> </span><span class="kw">slingshot</span>(deng_SCE, <span class="dt">clusterLabels =</span> <span class="st">'cell_type2'</span>,<span class="dt">reducedDim =</span> <span class="st">"PCA"</span>,</a> -<a class="sourceLine" id="cb649-4" data-line-number="4"> <span class="dt">allow.breaks =</span> <span class="ot">FALSE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb731"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb731-1" data-line-number="1"><span class="co">## runing slingshot</span></a> +<a class="sourceLine" id="cb731-2" data-line-number="2"></a> +<a class="sourceLine" id="cb731-3" data-line-number="3">deng_SCE <-<span class="st"> </span><span class="kw">slingshot</span>(deng_SCE, <span class="dt">clusterLabels =</span> 
<span class="st">'cell_type2'</span>,<span class="dt">reducedDim =</span> <span class="st">"PCA"</span>,</a> +<a class="sourceLine" id="cb731-4" data-line-number="4"> <span class="dt">allow.breaks =</span> <span class="ot">FALSE</span>)</a></code></pre></div> <pre><code>## Using diagonal covariance matrix</code></pre> -<div class="sourceCode" id="cb651"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb651-1" data-line-number="1"><span class="kw">summary</span>(deng_SCE<span class="op">$</span>slingPseudotime_<span class="dv">1</span>)</a></code></pre></div> +<div class="sourceCode" id="cb733"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb733-1" data-line-number="1"><span class="kw">summary</span>(deng_SCE<span class="op">$</span>slingPseudotime_<span class="dv">1</span>)</a></code></pre></div> <pre><code>## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ## 0.00 52.19 59.81 60.34 81.60 85.72 55</code></pre> -<div class="sourceCode" id="cb653"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb653-1" data-line-number="1"><span class="co">## get lineages inferred by slingshot</span></a> -<a class="sourceLine" id="cb653-2" data-line-number="2">lnes <-<span class="st"> </span><span class="kw">getLineages</span>(<span class="kw">reducedDim</span>(deng_SCE,<span class="st">"PCA"</span>),</a> -<a class="sourceLine" id="cb653-3" data-line-number="3"> deng_SCE<span class="op">$</span>cell_type2)</a></code></pre></div> +<div class="sourceCode" id="cb735"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb735-1" data-line-number="1"><span class="co">## get lineages inferred by slingshot</span></a> +<a class="sourceLine" id="cb735-2" data-line-number="2">lnes <-<span class="st"> </span><span class="kw">getLineages</span>(<span class="kw">reducedDim</span>(deng_SCE,<span class="st">"PCA"</span>),</a> +<a class="sourceLine" id="cb735-3" data-line-number="3"> 
deng_SCE<span class="op">$</span>cell_type2)</a></code></pre></div> <pre><code>## Using diagonal covariance matrix</code></pre> -<div class="sourceCode" id="cb655"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb655-1" data-line-number="1">lnes<span class="op">@</span>lineages</a></code></pre></div> +<div class="sourceCode" id="cb737"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb737-1" data-line-number="1">lnes<span class="op">@</span>lineages</a></code></pre></div> <pre><code>## $Lineage1 ## [1] "zy" "early2cell" "mid2cell" "late2cell" "4cell" ## [6] "16cell" "midblast" "earlyblast" @@ -677,48 +677,82 @@ nonlinear summary of the data. They are nonparametric, and their shape is sugges ## $Lineage3 ## [1] "zy" "early2cell" "mid2cell" "late2cell" "4cell" ## [6] "16cell" "8cell"</code></pre> -<div class="sourceCode" id="cb657"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb657-1" data-line-number="1"><span class="co">## plot the lineage overlay on the orginal PCA plot</span></a> -<a class="sourceLine" id="cb657-2" data-line-number="2"></a> -<a class="sourceLine" id="cb657-3" data-line-number="3"><span class="kw">plot</span>(<span class="kw">reducedDims</span>(deng_SCE)<span class="op">$</span>PCA, <span class="dt">col =</span> my_color[<span class="kw">as.character</span>(deng_SCE<span class="op">$</span>cell_type2)], </a> -<a class="sourceLine" id="cb657-4" data-line-number="4"> <span class="dt">pch=</span><span class="dv">16</span>, </a> -<a class="sourceLine" id="cb657-5" data-line-number="5"> <span class="dt">asp =</span> <span class="dv">1</span>)</a> -<a class="sourceLine" id="cb657-6" data-line-number="6"><span class="kw">legend</span>(<span class="st">"bottomleft"</span>,<span class="dt">legend =</span> <span class="kw">names</span>(my_color[<span class="kw">levels</span>(deng_SCE<span class="op">$</span>cell_type2)]), </a> -<a class="sourceLine" id="cb657-7" 
data-line-number="7"> <span class="dt">fill =</span> my_color[<span class="kw">levels</span>(deng_SCE<span class="op">$</span>cell_type2)])</a> -<a class="sourceLine" id="cb657-8" data-line-number="8"><span class="kw">lines</span>(<span class="kw">SlingshotDataSet</span>(deng_SCE), <span class="dt">lwd=</span><span class="dv">2</span>, <span class="dt">type =</span> <span class="st">'lineages'</span>, <span class="dt">col =</span> <span class="kw">c</span>(<span class="st">"black"</span>))</a></code></pre></div> +<div class="sourceCode" id="cb739"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb739-1" data-line-number="1"><span class="co">## plot the lineage overlay on the orginal PCA plot</span></a> +<a class="sourceLine" id="cb739-2" data-line-number="2"></a> +<a class="sourceLine" id="cb739-3" data-line-number="3"><span class="kw">plot</span>(<span class="kw">reducedDims</span>(deng_SCE)<span class="op">$</span>PCA, <span class="dt">col =</span> my_color[<span class="kw">as.character</span>(deng_SCE<span class="op">$</span>cell_type2)], </a> +<a class="sourceLine" id="cb739-4" data-line-number="4"> <span class="dt">pch=</span><span class="dv">16</span>, </a> +<a class="sourceLine" id="cb739-5" data-line-number="5"> <span class="dt">asp =</span> <span class="dv">1</span>)</a> +<a class="sourceLine" id="cb739-6" data-line-number="6"><span class="kw">legend</span>(<span class="st">"bottomleft"</span>,<span class="dt">legend =</span> <span class="kw">names</span>(my_color[<span class="kw">levels</span>(deng_SCE<span class="op">$</span>cell_type2)]), </a> +<a class="sourceLine" id="cb739-7" data-line-number="7"> <span class="dt">fill =</span> my_color[<span class="kw">levels</span>(deng_SCE<span class="op">$</span>cell_type2)])</a> +<a class="sourceLine" id="cb739-8" data-line-number="8"><span class="kw">lines</span>(<span class="kw">SlingshotDataSet</span>(deng_SCE), <span class="dt">lwd=</span><span class="dv">2</span>, <span 
class="dt">type =</span> <span class="st">'lineages'</span>, <span class="dt">col =</span> <span class="kw">c</span>(<span class="st">"black"</span>))</a></code></pre></div> <p><img src="pseudotime_files/figure-html/run_slingshot-1.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb658"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb658-1" data-line-number="1"><span class="co">## Plotting the pseudotime inferred by slingshot by cell types</span></a> -<a class="sourceLine" id="cb658-2" data-line-number="2"></a> -<a class="sourceLine" id="cb658-3" data-line-number="3">slingshot_df <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="kw">colData</span>(deng_SCE))</a> -<a class="sourceLine" id="cb658-4" data-line-number="4"></a> -<a class="sourceLine" id="cb658-5" data-line-number="5"><span class="kw">ggplot</span>(slingshot_df, <span class="kw">aes</span>(<span class="dt">x =</span> slingPseudotime_<span class="dv">1</span>, <span class="dt">y =</span> cell_type2, </a> -<a class="sourceLine" id="cb658-6" data-line-number="6"> <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> -<a class="sourceLine" id="cb658-7" data-line-number="7"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb658-8" data-line-number="8"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"First Slingshot pseudotime"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"cell type"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb658-9" data-line-number="9"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by Slingshot 
pseudotime"</span>)<span class="op">+</span><span class="kw">scale_colour_manual</span>(<span class="dt">values =</span> my_color)</a></code></pre></div> +<div class="sourceCode" id="cb740"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb740-1" data-line-number="1"><span class="co">## Plotting the pseudotime inferred by slingshot by cell types</span></a> +<a class="sourceLine" id="cb740-2" data-line-number="2"></a> +<a class="sourceLine" id="cb740-3" data-line-number="3">slingshot_df <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="kw">colData</span>(deng_SCE))</a> +<a class="sourceLine" id="cb740-4" data-line-number="4"></a> +<a class="sourceLine" id="cb740-5" data-line-number="5"><span class="kw">ggplot</span>(slingshot_df, <span class="kw">aes</span>(<span class="dt">x =</span> slingPseudotime_<span class="dv">1</span>, <span class="dt">y =</span> cell_type2, </a> +<a class="sourceLine" id="cb740-6" data-line-number="6"> <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> +<a class="sourceLine" id="cb740-7" data-line-number="7"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb740-8" data-line-number="8"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"First Slingshot pseudotime"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"cell type"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb740-9" data-line-number="9"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by Slingshot pseudotime"</span>)<span class="op">+</span><span class="kw">scale_colour_manual</span>(<span class="dt">values =</span> 
my_color)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/run_slingshot-2.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb659"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb659-1" data-line-number="1"><span class="kw">ggplot</span>(slingshot_df, <span class="kw">aes</span>(<span class="dt">x =</span> slingPseudotime_<span class="dv">2</span>, <span class="dt">y =</span> cell_type2, </a> -<a class="sourceLine" id="cb659-2" data-line-number="2"> <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> -<a class="sourceLine" id="cb659-3" data-line-number="3"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb659-4" data-line-number="4"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Second Slingshot pseudotime"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"cell type"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb659-5" data-line-number="5"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by Slingshot pseudotime"</span>)<span class="op">+</span><span class="kw">scale_colour_manual</span>(<span class="dt">values =</span> my_color)</a></code></pre></div> +<div class="sourceCode" id="cb741"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb741-1" data-line-number="1"><span class="kw">ggplot</span>(slingshot_df, <span class="kw">aes</span>(<span class="dt">x =</span> slingPseudotime_<span class="dv">2</span>, <span class="dt">y =</span> cell_type2, </a> +<a class="sourceLine" id="cb741-2" data-line-number="2"> <span class="dt">colour =</span> cell_type2)) <span 
class="op">+</span></a> +<a class="sourceLine" id="cb741-3" data-line-number="3"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb741-4" data-line-number="4"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Second Slingshot pseudotime"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"cell type"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb741-5" data-line-number="5"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by Slingshot pseudotime"</span>)<span class="op">+</span><span class="kw">scale_colour_manual</span>(<span class="dt">values =</span> my_color)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/run_slingshot-3.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb660"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb660-1" data-line-number="1"><span class="kw">ggplot</span>(slingshot_df, <span class="kw">aes</span>(<span class="dt">x =</span> slingPseudotime_<span class="dv">1</span>, <span class="dt">y =</span> slingPseudotime_<span class="dv">2</span>, </a> -<a class="sourceLine" id="cb660-2" data-line-number="2"> <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> -<a class="sourceLine" id="cb660-3" data-line-number="3"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb660-4" data-line-number="4"><span class="st"> </span><span 
class="kw">xlab</span>(<span class="st">"First Slingshot pseudotime"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Second Slingshot pseudotime"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb660-5" data-line-number="5"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by Slingshot pseudotime"</span>)<span class="op">+</span><span class="kw">scale_colour_manual</span>(<span class="dt">values =</span> my_color)</a></code></pre></div> +<div class="sourceCode" id="cb742"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb742-1" data-line-number="1"><span class="kw">ggplot</span>(slingshot_df, <span class="kw">aes</span>(<span class="dt">x =</span> slingPseudotime_<span class="dv">1</span>, <span class="dt">y =</span> slingPseudotime_<span class="dv">2</span>, </a> +<a class="sourceLine" id="cb742-2" data-line-number="2"> <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> +<a class="sourceLine" id="cb742-3" data-line-number="3"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb742-4" data-line-number="4"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"First Slingshot pseudotime"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Second Slingshot pseudotime"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb742-5" data-line-number="5"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by Slingshot pseudotime"</span>)<span class="op">+</span><span class="kw">scale_colour_manual</span>(<span class="dt">values =</span> 
my_color)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/run_slingshot-4.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb661"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb661-1" data-line-number="1"><span class="co"># </span></a> -<a class="sourceLine" id="cb661-2" data-line-number="2"><span class="co"># ggplot(slingshot_df, aes(x = slingPseudotime_1, y = slingPseudotime_2, </span></a> -<a class="sourceLine" id="cb661-3" data-line-number="3"><span class="co"># colour = slingPseudotime_3)) +</span></a> -<a class="sourceLine" id="cb661-4" data-line-number="4"><span class="co"># geom_point() + theme_classic() +</span></a> -<a class="sourceLine" id="cb661-5" data-line-number="5"><span class="co"># xlab("First Slingshot pseudotime") + ylab("Second Slingshot pseudotime") +</span></a> -<a class="sourceLine" id="cb661-6" data-line-number="6"><span class="co"># ggtitle("Cells ordered by Slingshot pseudotime")+facet_wrap(.~cell_type2)</span></a></code></pre></div> +<div class="sourceCode" id="cb743"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb743-1" data-line-number="1"><span class="co"># </span></a> +<a class="sourceLine" id="cb743-2" data-line-number="2"><span class="co"># ggplot(slingshot_df, aes(x = slingPseudotime_1, y = slingPseudotime_2, </span></a> +<a class="sourceLine" id="cb743-3" data-line-number="3"><span class="co"># colour = slingPseudotime_3)) +</span></a> +<a class="sourceLine" id="cb743-4" data-line-number="4"><span class="co"># geom_point() + theme_classic() +</span></a> +<a class="sourceLine" id="cb743-5" data-line-number="5"><span class="co"># xlab("First Slingshot pseudotime") + ylab("Second Slingshot pseudotime") +</span></a> +<a class="sourceLine" id="cb743-6" data-line-number="6"><span class="co"># ggtitle("Cells ordered by Slingshot pseudotime")+facet_wrap(.~cell_type2)</span></a></code></pre></div> <p><em>Note</em> 
You can also supply a start and an end cluster to <code>slingshot</code>.</p> <p><em>Comments</em> Did you notice the ordering of clusters in the lineage prediced for <code>16cells</code> state? There is an outlier-like cell in the 16cell group, find the outlier and remove it, then re-run <code>Slingshot</code>.</p> </div> +<div id="gam-general-additive-model-for-identifying-temporally-expressed-genes" class="section level3"> +<h3><span class="header-section-number">11.1.3</span> GAM general additive model for identifying temporally expressed genes</h3> +<p>After running slingshot, an interesting next step may be to find genes that change their expression over the course of development. We demonstrate one possible method for this type of analysis on the 100 most variable genes. We will regress each gene on the pseudotime variable we have generated, using a general additive model (GAM). This allows us to detect non-linear patterns in gene expression.</p> +<div class="sourceCode" id="cb744"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb744-1" data-line-number="1"><span class="kw">library</span>(gam)</a> +<a class="sourceLine" id="cb744-2" data-line-number="2">t <-<span class="st"> </span>deng_SCE<span class="op">$</span>slingPseudotime_<span class="dv">1</span></a> +<a class="sourceLine" id="cb744-3" data-line-number="3"></a> +<a class="sourceLine" id="cb744-4" data-line-number="4"><span class="co"># for time, only look at the 100 most variable genes </span></a> +<a class="sourceLine" id="cb744-5" data-line-number="5">Y <-<span class="st"> </span><span class="kw">log1p</span>(<span class="kw">assay</span>(deng_SCE,<span class="st">"logcounts"</span>))</a> +<a class="sourceLine" id="cb744-6" data-line-number="6"></a> +<a class="sourceLine" id="cb744-7" data-line-number="7">var100 <-<span class="st"> </span><span class="kw">names</span>(<span class="kw">sort</span>(<span class="kw">apply</span>(Y,<span class="dv">1</span>,var),<span 
class="dt">decreasing =</span> <span class="ot">TRUE</span>))[<span class="dv">1</span><span class="op">:</span><span class="dv">100</span>]</a> +<a class="sourceLine" id="cb744-8" data-line-number="8">Y <-<span class="st"> </span>Y[var100,]</a> +<a class="sourceLine" id="cb744-9" data-line-number="9"></a> +<a class="sourceLine" id="cb744-10" data-line-number="10"><span class="co"># fit a GAM with a loess term for pseudotime</span></a> +<a class="sourceLine" id="cb744-11" data-line-number="11">gam.pval <-<span class="st"> </span><span class="kw">apply</span>(Y,<span class="dv">1</span>,<span class="cf">function</span>(z){</a> +<a class="sourceLine" id="cb744-12" data-line-number="12"> d <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">z=</span>z, <span class="dt">t=</span>t)</a> +<a class="sourceLine" id="cb744-13" data-line-number="13"> <span class="kw">suppressWarnings</span>({</a> +<a class="sourceLine" id="cb744-14" data-line-number="14"> tmp <-<span class="st"> </span><span class="kw">gam</span>(z <span class="op">~</span><span class="st"> </span><span class="kw">lo</span>(t), <span class="dt">data=</span>d)</a> +<a class="sourceLine" id="cb744-15" data-line-number="15"> })</a> +<a class="sourceLine" id="cb744-16" data-line-number="16"> p <-<span class="st"> </span><span class="kw">summary</span>(tmp)[<span class="dv">3</span>][[<span class="dv">1</span>]][<span class="dv">2</span>,<span class="dv">3</span>]</a> +<a class="sourceLine" id="cb744-17" data-line-number="17"> p</a> +<a class="sourceLine" id="cb744-18" data-line-number="18">})</a> +<a class="sourceLine" id="cb744-19" data-line-number="19"></a> +<a class="sourceLine" id="cb744-20" data-line-number="20"><span class="co">## Plot the top 100 genes' expression </span></a> +<a class="sourceLine" id="cb744-21" data-line-number="21"></a> +<a class="sourceLine" id="cb744-22" data-line-number="22">topgenes <-<span class="st"> </span><span class="kw">names</span>(<span 
class="kw">sort</span>(gam.pval, <span class="dt">decreasing =</span> <span class="ot">FALSE</span>))[<span class="dv">1</span><span class="op">:</span><span class="dv">100</span>]</a> +<a class="sourceLine" id="cb744-23" data-line-number="23"></a> +<a class="sourceLine" id="cb744-24" data-line-number="24">heatdata <-<span class="st"> </span><span class="kw">assays</span>(deng_SCE)<span class="op">$</span>logcounts[topgenes, <span class="kw">order</span>(t, <span class="dt">na.last =</span> <span class="ot">NA</span>)]</a> +<a class="sourceLine" id="cb744-25" data-line-number="25">heatclus <-<span class="st"> </span>deng_SCE<span class="op">$</span>cell_type2[<span class="kw">order</span>(t, <span class="dt">na.last =</span> <span class="ot">NA</span>)]</a> +<a class="sourceLine" id="cb744-26" data-line-number="26"></a> +<a class="sourceLine" id="cb744-27" data-line-number="27"><span class="kw">heatmap</span>(heatdata, <span class="dt">Colv =</span> <span class="ot">NA</span>,</a> +<a class="sourceLine" id="cb744-28" data-line-number="28"> <span class="dt">ColSideColors =</span> my_color[heatclus],<span class="dt">cexRow =</span> <span class="dv">1</span>,<span class="dt">cexCol =</span> <span class="dv">1</span>)</a></code></pre></div> +<p><img src="pseudotime_files/figure-html/gam_tm_deg-1.png" width="90%" style="display: block; margin: auto;" /></p> +<p>We will regress each gene on the pseudotime variable we have generated, using a general additive model (GAM). This allows us to detect non-linear patterns in gene expression.</p> +</div> <div id="monocle" class="section level3"> -<h3><span class="header-section-number">11.1.3</span> Monocle</h3> +<h3><span class="header-section-number">11.1.4</span> Monocle</h3> <p>The original <code>Monocle</code> <span class="citation">(Trapnell et al. 
<a href="#ref-Trapnell2014-os">2014</a>)</span> method skips the clustering stage of TSCAN and directly builds a minimum spanning tree on a reduced dimension representation (using ‘ICA’) of the cells to connect all cells. <code>Monocle</code> then identifies the longest path @@ -728,7 +762,7 @@ cell-types), monocle can identify these. Each of the resulting forked paths is defined as a separate cell state.</p> </div> <div id="monocle-2" class="section level3"> -<h3><span class="header-section-number">11.1.4</span> Monocle 2</h3> +<h3><span class="header-section-number">11.1.5</span> Monocle 2</h3> <p><code>Monocle 2</code> <span class="citation">(Qiu et al. <a href="#ref-Qiu2017-xq">2017</a>)</span> uses a different approach, with dimensionality reduction and ordering performed by reverse graph embedding (RGE), allowing it to detect branching events in an unsupervised manner. RGE, a machine-learning strategy, learns a ‘principal graph’ to describe the single-cell dataset. RGE also learns the mapping function of data points on the trajectory back to the original high dimentional space simutaneously. 
In doing so, it aims to position the latent points in the lower dimension space (along the trajectory) while also ensuring their corresponding @@ -738,65 +772,65 @@ DDRTree learns latent points and the projection of latent points to the points i <p>DDRTree returns a principal tree of the centroids of cell clusters in low dimension, pseudotime is derived for individual cells by calculating geomdestic distance of their projections onto the tree from the root (user-defined or arbitrarily assigned).</p> <p><strong>Note</strong> Informally, a principal graph is like a principal curve which passes through the ‘middle’ of a data set but is allowed to have branches.</p> -<div class="sourceCode" id="cb662"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb662-1" data-line-number="1"><span class="kw">library</span>(monocle)</a> -<a class="sourceLine" id="cb662-2" data-line-number="2"><span class="co">#d <- deng_SCE[m3dGenes,]</span></a> -<a class="sourceLine" id="cb662-3" data-line-number="3"><span class="co">## feature selection </span></a> -<a class="sourceLine" id="cb662-4" data-line-number="4">deng <-<span class="st"> </span><span class="kw">counts</span>(deng_SCE)</a> -<a class="sourceLine" id="cb662-5" data-line-number="5"></a> -<a class="sourceLine" id="cb662-6" data-line-number="6">m3dGenes <-<span class="st"> </span><span class="kw">as.character</span>(</a> -<a class="sourceLine" id="cb662-7" data-line-number="7"> <span class="kw">M3DropFeatureSelection</span>(deng)<span class="op">$</span>Gene</a> -<a class="sourceLine" id="cb662-8" data-line-number="8">)</a></code></pre></div> +<div class="sourceCode" id="cb745"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb745-1" data-line-number="1"><span class="kw">library</span>(monocle)</a> +<a class="sourceLine" id="cb745-2" data-line-number="2"><span class="co">#d <- deng_SCE[m3dGenes,]</span></a> +<a class="sourceLine" id="cb745-3" data-line-number="3"><span 
class="co">## feature selection </span></a> +<a class="sourceLine" id="cb745-4" data-line-number="4">deng <-<span class="st"> </span><span class="kw">counts</span>(deng_SCE)</a> +<a class="sourceLine" id="cb745-5" data-line-number="5"></a> +<a class="sourceLine" id="cb745-6" data-line-number="6">m3dGenes <-<span class="st"> </span><span class="kw">as.character</span>(</a> +<a class="sourceLine" id="cb745-7" data-line-number="7"> <span class="kw">M3DropFeatureSelection</span>(deng)<span class="op">$</span>Gene</a> +<a class="sourceLine" id="cb745-8" data-line-number="8">)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/monocle2-all-genes-1.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb663"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb663-1" data-line-number="1">d <-<span class="st"> </span>deng_SCE[<span class="kw">which</span>(<span class="kw">rownames</span>(deng_SCE) <span class="op">%in%</span><span class="st"> </span>m3dGenes), ]</a> -<a class="sourceLine" id="cb663-2" data-line-number="2">d <-<span class="st"> </span>d[<span class="op">!</span><span class="kw">duplicated</span>(<span class="kw">rownames</span>(d)), ]</a> -<a class="sourceLine" id="cb663-3" data-line-number="3"></a> -<a class="sourceLine" id="cb663-4" data-line-number="4"><span class="kw">colnames</span>(d) <-<span class="st"> </span><span class="dv">1</span><span class="op">:</span><span class="kw">ncol</span>(d)</a> -<a class="sourceLine" id="cb663-5" data-line-number="5">geneNames <-<span class="st"> </span><span class="kw">rownames</span>(d)</a> -<a class="sourceLine" id="cb663-6" data-line-number="6"><span class="kw">rownames</span>(d) <-<span class="st"> </span><span class="dv">1</span><span class="op">:</span><span class="kw">nrow</span>(d)</a> -<a class="sourceLine" id="cb663-7" data-line-number="7">pd <-<span class="st"> </span><span class="kw">data.frame</span>(<span 
class="dt">timepoint =</span> cellLabels)</a> -<a class="sourceLine" id="cb663-8" data-line-number="8">pd <-<span class="st"> </span><span class="kw">new</span>(<span class="st">"AnnotatedDataFrame"</span>, <span class="dt">data=</span>pd)</a> -<a class="sourceLine" id="cb663-9" data-line-number="9">fd <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">gene_short_name =</span> geneNames)</a> -<a class="sourceLine" id="cb663-10" data-line-number="10">fd <-<span class="st"> </span><span class="kw">new</span>(<span class="st">"AnnotatedDataFrame"</span>, <span class="dt">data=</span>fd)</a> -<a class="sourceLine" id="cb663-11" data-line-number="11"></a> -<a class="sourceLine" id="cb663-12" data-line-number="12">dCellData <-<span class="st"> </span><span class="kw">newCellDataSet</span>(<span class="kw">counts</span>(d), <span class="dt">phenoData =</span> pd, <span class="dt">featureData =</span> fd)</a> -<a class="sourceLine" id="cb663-13" data-line-number="13"><span class="co">#</span></a> -<a class="sourceLine" id="cb663-14" data-line-number="14">dCellData <-<span class="st"> </span><span class="kw">setOrderingFilter</span>(dCellData, <span class="kw">which</span>(geneNames <span class="op">%in%</span><span class="st"> </span>m3dGenes))</a> -<a class="sourceLine" id="cb663-15" data-line-number="15">dCellData <-<span class="st"> </span><span class="kw">estimateSizeFactors</span>(dCellData)</a> -<a class="sourceLine" id="cb663-16" data-line-number="16">dCellDataSet <-<span class="st"> </span><span class="kw">reduceDimension</span>(dCellData,<span class="dt">reduction_method =</span> <span class="st">"DDRTree"</span>, <span class="dt">pseudo_expr =</span> <span class="dv">1</span>)</a> -<a class="sourceLine" id="cb663-17" data-line-number="17">dCellDataSet <-<span class="st"> </span><span class="kw">orderCells</span>(dCellDataSet, <span class="dt">reverse =</span> <span class="ot">FALSE</span>)</a> -<a class="sourceLine" id="cb663-18" 
data-line-number="18"><span class="kw">plot_cell_trajectory</span>(dCellDataSet)</a></code></pre></div> +<div class="sourceCode" id="cb746"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb746-1" data-line-number="1">d <-<span class="st"> </span>deng_SCE[<span class="kw">which</span>(<span class="kw">rownames</span>(deng_SCE) <span class="op">%in%</span><span class="st"> </span>m3dGenes), ]</a> +<a class="sourceLine" id="cb746-2" data-line-number="2">d <-<span class="st"> </span>d[<span class="op">!</span><span class="kw">duplicated</span>(<span class="kw">rownames</span>(d)), ]</a> +<a class="sourceLine" id="cb746-3" data-line-number="3"></a> +<a class="sourceLine" id="cb746-4" data-line-number="4"><span class="kw">colnames</span>(d) <-<span class="st"> </span><span class="dv">1</span><span class="op">:</span><span class="kw">ncol</span>(d)</a> +<a class="sourceLine" id="cb746-5" data-line-number="5">geneNames <-<span class="st"> </span><span class="kw">rownames</span>(d)</a> +<a class="sourceLine" id="cb746-6" data-line-number="6"><span class="kw">rownames</span>(d) <-<span class="st"> </span><span class="dv">1</span><span class="op">:</span><span class="kw">nrow</span>(d)</a> +<a class="sourceLine" id="cb746-7" data-line-number="7">pd <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">timepoint =</span> cellLabels)</a> +<a class="sourceLine" id="cb746-8" data-line-number="8">pd <-<span class="st"> </span><span class="kw">new</span>(<span class="st">"AnnotatedDataFrame"</span>, <span class="dt">data=</span>pd)</a> +<a class="sourceLine" id="cb746-9" data-line-number="9">fd <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">gene_short_name =</span> geneNames)</a> +<a class="sourceLine" id="cb746-10" data-line-number="10">fd <-<span class="st"> </span><span class="kw">new</span>(<span class="st">"AnnotatedDataFrame"</span>, <span class="dt">data=</span>fd)</a> +<a 
class="sourceLine" id="cb746-11" data-line-number="11"></a> +<a class="sourceLine" id="cb746-12" data-line-number="12">dCellData <-<span class="st"> </span><span class="kw">newCellDataSet</span>(<span class="kw">counts</span>(d), <span class="dt">phenoData =</span> pd, <span class="dt">featureData =</span> fd)</a> +<a class="sourceLine" id="cb746-13" data-line-number="13"><span class="co">#</span></a> +<a class="sourceLine" id="cb746-14" data-line-number="14">dCellData <-<span class="st"> </span><span class="kw">setOrderingFilter</span>(dCellData, <span class="kw">which</span>(geneNames <span class="op">%in%</span><span class="st"> </span>m3dGenes))</a> +<a class="sourceLine" id="cb746-15" data-line-number="15">dCellData <-<span class="st"> </span><span class="kw">estimateSizeFactors</span>(dCellData)</a> +<a class="sourceLine" id="cb746-16" data-line-number="16">dCellDataSet <-<span class="st"> </span><span class="kw">reduceDimension</span>(dCellData,<span class="dt">reduction_method =</span> <span class="st">"DDRTree"</span>, <span class="dt">pseudo_expr =</span> <span class="dv">1</span>)</a> +<a class="sourceLine" id="cb746-17" data-line-number="17">dCellDataSet <-<span class="st"> </span><span class="kw">orderCells</span>(dCellDataSet, <span class="dt">reverse =</span> <span class="ot">FALSE</span>)</a> +<a class="sourceLine" id="cb746-18" data-line-number="18"><span class="kw">plot_cell_trajectory</span>(dCellDataSet)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/monocle2-all-genes-2.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb664"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb664-1" data-line-number="1"><span class="co"># Store the ordering</span></a> -<a class="sourceLine" id="cb664-2" data-line-number="2">pseudotime_monocle2 <-</a> -<a class="sourceLine" id="cb664-3" data-line-number="3"><span class="st"> </span><span class="kw">data.frame</span>(</a> -<a 
class="sourceLine" id="cb664-4" data-line-number="4"> <span class="dt">Timepoint =</span> <span class="kw">phenoData</span>(dCellDataSet)<span class="op">$</span>timepoint,</a> -<a class="sourceLine" id="cb664-5" data-line-number="5"> <span class="dt">pseudotime =</span> <span class="kw">phenoData</span>(dCellDataSet)<span class="op">$</span>Pseudotime,</a> -<a class="sourceLine" id="cb664-6" data-line-number="6"> <span class="dt">State =</span> <span class="kw">phenoData</span>(dCellDataSet)<span class="op">$</span>State</a> -<a class="sourceLine" id="cb664-7" data-line-number="7"> )</a> -<a class="sourceLine" id="cb664-8" data-line-number="8"><span class="kw">rownames</span>(pseudotime_monocle2) <-<span class="st"> </span><span class="dv">1</span><span class="op">:</span><span class="kw">ncol</span>(d)</a> -<a class="sourceLine" id="cb664-9" data-line-number="9">pseudotime_order_monocle <-</a> -<a class="sourceLine" id="cb664-10" data-line-number="10"><span class="st"> </span><span class="kw">rownames</span>(pseudotime_monocle2[<span class="kw">order</span>(pseudotime_monocle2<span class="op">$</span>pseudotime), ])</a></code></pre></div> +<div class="sourceCode" id="cb747"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb747-1" data-line-number="1"><span class="co"># Store the ordering</span></a> +<a class="sourceLine" id="cb747-2" data-line-number="2">pseudotime_monocle2 <-</a> +<a class="sourceLine" id="cb747-3" data-line-number="3"><span class="st"> </span><span class="kw">data.frame</span>(</a> +<a class="sourceLine" id="cb747-4" data-line-number="4"> <span class="dt">Timepoint =</span> <span class="kw">phenoData</span>(dCellDataSet)<span class="op">$</span>timepoint,</a> +<a class="sourceLine" id="cb747-5" data-line-number="5"> <span class="dt">pseudotime =</span> <span class="kw">phenoData</span>(dCellDataSet)<span class="op">$</span>Pseudotime,</a> +<a class="sourceLine" id="cb747-6" data-line-number="6"> <span 
class="dt">State =</span> <span class="kw">phenoData</span>(dCellDataSet)<span class="op">$</span>State</a> +<a class="sourceLine" id="cb747-7" data-line-number="7"> )</a> +<a class="sourceLine" id="cb747-8" data-line-number="8"><span class="kw">rownames</span>(pseudotime_monocle2) <-<span class="st"> </span><span class="dv">1</span><span class="op">:</span><span class="kw">ncol</span>(d)</a> +<a class="sourceLine" id="cb747-9" data-line-number="9">pseudotime_order_monocle <-</a> +<a class="sourceLine" id="cb747-10" data-line-number="10"><span class="st"> </span><span class="kw">rownames</span>(pseudotime_monocle2[<span class="kw">order</span>(pseudotime_monocle2<span class="op">$</span>pseudotime), ])</a></code></pre></div> <p><em>Note</em> check other available methods for <code>?reduceDimension</code></p> <p>We can again compare the inferred pseudotime to the known sampling timepoints.</p> -<div class="sourceCode" id="cb665"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb665-1" data-line-number="1">deng_SCE<span class="op">$</span>pseudotime_monocle2 <-<span class="st"> </span>pseudotime_monocle2<span class="op">$</span>pseudotime</a> -<a class="sourceLine" id="cb665-2" data-line-number="2"></a> -<a class="sourceLine" id="cb665-3" data-line-number="3"><span class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(<span class="kw">colData</span>(deng_SCE)), </a> -<a class="sourceLine" id="cb665-4" data-line-number="4"> <span class="kw">aes</span>(<span class="dt">x =</span> pseudotime_monocle2, </a> -<a class="sourceLine" id="cb665-5" data-line-number="5"> <span class="dt">y =</span> cell_type2, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> -<a class="sourceLine" id="cb665-6" data-line-number="6"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb665-7" 
data-line-number="7"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb665-8" data-line-number="8"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"monocle2 pseudotime"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb665-9" data-line-number="9"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by monocle2 pseudotime"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb748"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb748-1" data-line-number="1">deng_SCE<span class="op">$</span>pseudotime_monocle2 <-<span class="st"> </span>pseudotime_monocle2<span class="op">$</span>pseudotime</a> +<a class="sourceLine" id="cb748-2" data-line-number="2"></a> +<a class="sourceLine" id="cb748-3" data-line-number="3"><span class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(<span class="kw">colData</span>(deng_SCE)), </a> +<a class="sourceLine" id="cb748-4" data-line-number="4"> <span class="kw">aes</span>(<span class="dt">x =</span> pseudotime_monocle2, </a> +<a class="sourceLine" id="cb748-5" data-line-number="5"> <span class="dt">y =</span> cell_type2, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> +<a class="sourceLine" id="cb748-6" data-line-number="6"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb748-7" data-line-number="7"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span 
class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb748-8" data-line-number="8"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"monocle2 pseudotime"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb748-9" data-line-number="9"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by monocle2 pseudotime"</span>)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/monocle-vs-truth-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>Monocle 2 performs pretty well on these cells.</p> </div> <div id="monocle-3" class="section level3"> -<h3><span class="header-section-number">11.1.5</span> Monocle 3</h3> +<h3><span class="header-section-number">11.1.6</span> Monocle 3</h3> <p><a href="https://www.nature.com/articles/s41586-019-0969-x"><code>Monocle3</code></a><span class="citation">(Cao et al. <a href="#ref-Cao2019-cj">2019</a>)</span> is the updated single-cell analysis toolkit for analysing large datasets. <a href="https://cole-trapnell-lab.github.io/monocle3/docs/starting/">Monocle 3</a> is designed for use with absolute transcript counts (e.g. from UMI experiments). 
It first does dimension reduction with UMAP and then clusters the cells with Louvain/Leiden algorithms and merges adjacent groups into supergroups, and finally resolves the trajectories individual cells can take during development, identifies the locations of branches and convergences within each supergroup.</p> <p>In short, Monocle3 uses <code>UMAP</code> to construct an initial trajectory inference and refines it with learning principal graph.</p> <p>It builds KNN graph in the UMAP dimensions and runs Louvain/Leiden algorithms on the KNN graph to derive communities; edges are drawn to connect communities that have more links (Partitioned Approximate Graph Abstraction (PAGA) graph). Each component of the PAGA graph is passed to the next step which is learning principal graph based on the SimplePPT algorithm. The pseudotime is calculated for individual cells by projecting the cells to their nearest point on the principal graph edge and measuring the geodesic distance along the principal points to the closest of their root nodes.</p> -<div class="sourceCode" id="cb666"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb666-1" data-line-number="1"><span class="kw">library</span>(monocle3)</a></code></pre></div> +<div class="sourceCode" id="cb749"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb749-1" data-line-number="1"><span class="kw">library</span>(monocle3)</a></code></pre></div> <pre><code>## ## Attaching package: 'monocle3'</code></pre> <pre><code>## The following objects are masked from 'package:monocle': @@ -806,91 +840,91 @@ the locations of branches and convergences within each supergroup.</p> <pre><code>## The following objects are masked from 'package:Biobase': ## ## exprs, fData, fData<-, pData, pData<-</code></pre> -<div class="sourceCode" id="cb670"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb670-1" data-line-number="1">gene_meta <-<span class="st"> </span><span
class="kw">rowData</span>(deng_SCE)</a> -<a class="sourceLine" id="cb670-2" data-line-number="2"><span class="co">#gene_metadata must contain a column verbatim named 'gene_short_name' for certain functions.</span></a> -<a class="sourceLine" id="cb670-3" data-line-number="3">gene_meta<span class="op">$</span>gene_short_name <-<span class="st"> </span><span class="kw">rownames</span>(gene_meta)</a> -<a class="sourceLine" id="cb670-4" data-line-number="4">cds <-<span class="st"> </span><span class="kw">new_cell_data_set</span>(<span class="dt">expression_data =</span> <span class="kw">counts</span>(deng_SCE),</a> -<a class="sourceLine" id="cb670-5" data-line-number="5"> <span class="dt">cell_metadata =</span> <span class="kw">colData</span>(deng_SCE),</a> -<a class="sourceLine" id="cb670-6" data-line-number="6"> <span class="dt">gene_metadata =</span> gene_meta)</a> -<a class="sourceLine" id="cb670-7" data-line-number="7"></a> -<a class="sourceLine" id="cb670-8" data-line-number="8"><span class="co">## Step 1: Normalize and pre-process the data</span></a> -<a class="sourceLine" id="cb670-9" data-line-number="9">cds <-<span class="st"> </span><span class="kw">preprocess_cds</span>(cds,<span class="dt">num_dim =</span> <span class="dv">5</span>)</a> -<a class="sourceLine" id="cb670-10" data-line-number="10"><span class="kw">plot_pc_variance_explained</span>(cds)</a></code></pre></div> +<div class="sourceCode" id="cb753"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb753-1" data-line-number="1">gene_meta <-<span class="st"> </span><span class="kw">rowData</span>(deng_SCE)</a> +<a class="sourceLine" id="cb753-2" data-line-number="2"><span class="co">#gene_metadata must contain a column verbatim named 'gene_short_name' for certain functions.</span></a> +<a class="sourceLine" id="cb753-3" data-line-number="3">gene_meta<span class="op">$</span>gene_short_name <-<span class="st"> </span><span class="kw">rownames</span>(gene_meta)</a> +<a 
class="sourceLine" id="cb753-4" data-line-number="4">cds <-<span class="st"> </span><span class="kw">new_cell_data_set</span>(<span class="dt">expression_data =</span> <span class="kw">counts</span>(deng_SCE),</a> +<a class="sourceLine" id="cb753-5" data-line-number="5"> <span class="dt">cell_metadata =</span> <span class="kw">colData</span>(deng_SCE),</a> +<a class="sourceLine" id="cb753-6" data-line-number="6"> <span class="dt">gene_metadata =</span> gene_meta)</a> +<a class="sourceLine" id="cb753-7" data-line-number="7"></a> +<a class="sourceLine" id="cb753-8" data-line-number="8"><span class="co">## Step 1: Normalize and pre-process the data</span></a> +<a class="sourceLine" id="cb753-9" data-line-number="9">cds <-<span class="st"> </span><span class="kw">preprocess_cds</span>(cds,<span class="dt">num_dim =</span> <span class="dv">5</span>)</a> +<a class="sourceLine" id="cb753-10" data-line-number="10"><span class="kw">plot_pc_variance_explained</span>(cds)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/run_monocle3-1.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb671"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb671-1" data-line-number="1"><span class="co">## Step 3: Reduce the dimensions using UMAP</span></a> -<a class="sourceLine" id="cb671-2" data-line-number="2">cds <-<span class="st"> </span><span class="kw">reduce_dimension</span>(cds)</a></code></pre></div> +<div class="sourceCode" id="cb754"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb754-1" data-line-number="1"><span class="co">## Step 3: Reduce the dimensions using UMAP</span></a> +<a class="sourceLine" id="cb754-2" data-line-number="2">cds <-<span class="st"> </span><span class="kw">reduce_dimension</span>(cds)</a></code></pre></div> <pre><code>## No preprocess_method specified, using preprocess_method = 'PCA'</code></pre> -<div class="sourceCode" id="cb673"><pre 
class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb673-1" data-line-number="1"><span class="co">## Step 4: Cluster the cells</span></a> -<a class="sourceLine" id="cb673-2" data-line-number="2">cds <-<span class="st"> </span><span class="kw">cluster_cells</span>(cds)</a> -<a class="sourceLine" id="cb673-3" data-line-number="3"></a> -<a class="sourceLine" id="cb673-4" data-line-number="4"><span class="co">## change the clusters</span></a> -<a class="sourceLine" id="cb673-5" data-line-number="5"></a> -<a class="sourceLine" id="cb673-6" data-line-number="6"><span class="co">## cds@clusters$UMAP$clusters <- deng_SCE$cell_type2</span></a> -<a class="sourceLine" id="cb673-7" data-line-number="7"></a> -<a class="sourceLine" id="cb673-8" data-line-number="8"><span class="co">## Step 5: Learn a graph</span></a> -<a class="sourceLine" id="cb673-9" data-line-number="9">cds <-<span class="st"> </span><span class="kw">learn_graph</span>(cds,<span class="dt">use_partition =</span> <span class="ot">TRUE</span>)</a> -<a class="sourceLine" id="cb673-10" data-line-number="10"></a> -<a class="sourceLine" id="cb673-11" data-line-number="11"><span class="co">## Step 6: Order cells</span></a> -<a class="sourceLine" id="cb673-12" data-line-number="12">cds <-<span class="st"> </span><span class="kw">order_cells</span>(cds, <span class="dt">root_cells =</span> <span class="kw">c</span>(<span class="st">"zy"</span>,<span class="st">"zy.1"</span>,<span class="st">"zy.2"</span>,<span class="st">"zy.3"</span>) )</a> -<a class="sourceLine" id="cb673-13" data-line-number="13"></a> -<a class="sourceLine" id="cb673-14" data-line-number="14"><span class="kw">plot_cells</span>(cds, <span class="dt">color_cells_by=</span><span class="st">"cell_type2"</span>, <span class="dt">graph_label_size =</span> <span class="dv">4</span>, <span class="dt">cell_size =</span> <span class="dv">2</span>,</a> -<a class="sourceLine" id="cb673-15" data-line-number="15"> <span 
class="dt">group_label_size =</span> <span class="dv">6</span>)<span class="op">+</span><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) </a></code></pre></div> +<div class="sourceCode" id="cb756"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb756-1" data-line-number="1"><span class="co">## Step 4: Cluster the cells</span></a> +<a class="sourceLine" id="cb756-2" data-line-number="2">cds <-<span class="st"> </span><span class="kw">cluster_cells</span>(cds)</a> +<a class="sourceLine" id="cb756-3" data-line-number="3"></a> +<a class="sourceLine" id="cb756-4" data-line-number="4"><span class="co">## change the clusters</span></a> +<a class="sourceLine" id="cb756-5" data-line-number="5"></a> +<a class="sourceLine" id="cb756-6" data-line-number="6"><span class="co">## cds@clusters$UMAP$clusters <- deng_SCE$cell_type2</span></a> +<a class="sourceLine" id="cb756-7" data-line-number="7"></a> +<a class="sourceLine" id="cb756-8" data-line-number="8"><span class="co">## Step 5: Learn a graph</span></a> +<a class="sourceLine" id="cb756-9" data-line-number="9">cds <-<span class="st"> </span><span class="kw">learn_graph</span>(cds,<span class="dt">use_partition =</span> <span class="ot">TRUE</span>)</a> +<a class="sourceLine" id="cb756-10" data-line-number="10"></a> +<a class="sourceLine" id="cb756-11" data-line-number="11"><span class="co">## Step 6: Order cells</span></a> +<a class="sourceLine" id="cb756-12" data-line-number="12">cds <-<span class="st"> </span><span class="kw">order_cells</span>(cds, <span class="dt">root_cells =</span> <span class="kw">c</span>(<span class="st">"zy"</span>,<span class="st">"zy.1"</span>,<span class="st">"zy.2"</span>,<span class="st">"zy.3"</span>) )</a> +<a class="sourceLine" id="cb756-13" data-line-number="13"></a> +<a class="sourceLine" id="cb756-14" data-line-number="14"><span class="kw">plot_cells</span>(cds, <span 
class="dt">color_cells_by=</span><span class="st">"cell_type2"</span>, <span class="dt">graph_label_size =</span> <span class="dv">4</span>, <span class="dt">cell_size =</span> <span class="dv">2</span>,</a> +<a class="sourceLine" id="cb756-15" data-line-number="15"> <span class="dt">group_label_size =</span> <span class="dv">6</span>)<span class="op">+</span><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) </a></code></pre></div> <p><img src="pseudotime_files/figure-html/run_monocle3-2.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb674"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb674-1" data-line-number="1"><span class="kw">plot_cells</span>(cds, <span class="dt">graph_label_size =</span> <span class="dv">6</span>, <span class="dt">cell_size =</span> <span class="dv">1</span>, <span class="dt">color_cells_by=</span><span class="st">"pseudotime"</span>,</a> -<a class="sourceLine" id="cb674-2" data-line-number="2"> <span class="dt">group_label_size =</span> <span class="dv">6</span>)</a></code></pre></div> +<div class="sourceCode" id="cb757"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb757-1" data-line-number="1"><span class="kw">plot_cells</span>(cds, <span class="dt">graph_label_size =</span> <span class="dv">6</span>, <span class="dt">cell_size =</span> <span class="dv">1</span>, <span class="dt">color_cells_by=</span><span class="st">"pseudotime"</span>,</a> +<a class="sourceLine" id="cb757-2" data-line-number="2"> <span class="dt">group_label_size =</span> <span class="dv">6</span>)</a></code></pre></div> <pre><code>## Cells aren't colored in a way that allows them to be grouped.</code></pre> <p><img src="pseudotime_files/figure-html/run_monocle3-3.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb676"><pre class="sourceCode r"><code 
class="sourceCode r"><a class="sourceLine" id="cb676-1" data-line-number="1">pdata_cds <-<span class="st"> </span><span class="kw">pData</span>(cds)</a> -<a class="sourceLine" id="cb676-2" data-line-number="2">pdata_cds<span class="op">$</span>pseudotime_monocle3 <-<span class="st"> </span>monocle3<span class="op">::</span><span class="kw">pseudotime</span>(cds)</a> -<a class="sourceLine" id="cb676-3" data-line-number="3"></a> -<a class="sourceLine" id="cb676-4" data-line-number="4"><span class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(pdata_cds), </a> -<a class="sourceLine" id="cb676-5" data-line-number="5"> <span class="kw">aes</span>(<span class="dt">x =</span> pseudotime_monocle3, </a> -<a class="sourceLine" id="cb676-6" data-line-number="6"> <span class="dt">y =</span> cell_type2, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> -<a class="sourceLine" id="cb676-7" data-line-number="7"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb676-8" data-line-number="8"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb676-9" data-line-number="9"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"monocle3 pseudotime"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb676-10" data-line-number="10"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by monocle3 pseudotime"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb759"><pre class="sourceCode r"><code class="sourceCode r"><a 
class="sourceLine" id="cb759-1" data-line-number="1">pdata_cds <-<span class="st"> </span><span class="kw">pData</span>(cds)</a> +<a class="sourceLine" id="cb759-2" data-line-number="2">pdata_cds<span class="op">$</span>pseudotime_monocle3 <-<span class="st"> </span>monocle3<span class="op">::</span><span class="kw">pseudotime</span>(cds)</a> +<a class="sourceLine" id="cb759-3" data-line-number="3"></a> +<a class="sourceLine" id="cb759-4" data-line-number="4"><span class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(pdata_cds), </a> +<a class="sourceLine" id="cb759-5" data-line-number="5"> <span class="kw">aes</span>(<span class="dt">x =</span> pseudotime_monocle3, </a> +<a class="sourceLine" id="cb759-6" data-line-number="6"> <span class="dt">y =</span> cell_type2, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> +<a class="sourceLine" id="cb759-7" data-line-number="7"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb759-8" data-line-number="8"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb759-9" data-line-number="9"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"monocle3 pseudotime"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb759-10" data-line-number="10"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by monocle3 pseudotime"</span>)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/run_monocle3-4.png" width="90%" style="display: block; margin: auto;" /></p> 
-<div class="sourceCode" id="cb677"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb677-1" data-line-number="1">deng_SCE<span class="op">$</span>pseudotime_monocle3 <-<span class="st"> </span>pdata_cds<span class="op">$</span>pseudotime_monocle3</a></code></pre></div> +<div class="sourceCode" id="cb760"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb760-1" data-line-number="1">deng_SCE<span class="op">$</span>pseudotime_monocle3 <-<span class="st"> </span>pdata_cds<span class="op">$</span>pseudotime_monocle3</a></code></pre></div> <p>It did not work well for our small Smart-seq2 dataset.</p> </div> <div id="diffusion-maps" class="section level3"> -<h3><span class="header-section-number">11.1.6</span> Diffusion maps</h3> +<h3><span class="header-section-number">11.1.7</span> Diffusion maps</h3> <p><a href="https://en.wikipedia.org/wiki/Diffusion_map">Diffusion maps</a> were introduced by <a href="http://www.sciencedirect.com/science/article/pii/S1063520306000546">Ronald Coifman and Stephane Lafon</a><span class="citation">(Coifman and Lafon <a href="#ref-Coifman2006-oa">2006</a>)</span>, and the underlying idea is to assume that the data are samples from a diffusion process. The method infers the low-dimensional manifold by estimating the eigenvalues and eigenvectors for the diffusion operator related to the data.</p> <p><a href="https://academic.oup.com/bioinformatics/article/32/8/1241/1744143">Angerer et al</a><span class="citation">(Angerer et al. 
<a href="#ref-Angerer2016-rn">2016</a>)</span> have applied the diffusion maps concept to the analysis of single-cell RNA-seq data to create an R package called <a href="http://bioconductor.org/packages/destiny">destiny</a>.</p> <p>We will take the rank order of cells in the first diffusion map component as “diffusion map pseudotime” here.</p> -<div class="sourceCode" id="cb678"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb678-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">logcounts</span>(deng_SCE)</a> -<a class="sourceLine" id="cb678-2" data-line-number="2"><span class="kw">colnames</span>(deng) <-<span class="st"> </span>cellLabels</a> -<a class="sourceLine" id="cb678-3" data-line-number="3">dm <-<span class="st"> </span><span class="kw">DiffusionMap</span>(<span class="kw">t</span>(deng))</a> -<a class="sourceLine" id="cb678-4" data-line-number="4"></a> -<a class="sourceLine" id="cb678-5" data-line-number="5">tmp <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">DC1 =</span> <span class="kw">eigenvectors</span>(dm)[,<span class="dv">1</span>],</a> -<a class="sourceLine" id="cb678-6" data-line-number="6">                  <span class="dt">DC2 =</span> <span class="kw">eigenvectors</span>(dm)[,<span class="dv">2</span>],</a> -<a class="sourceLine" id="cb678-7" data-line-number="7">                  <span class="dt">Timepoint =</span> deng_SCE<span class="op">$</span>cell_type2)</a> -<a class="sourceLine" id="cb678-8" data-line-number="8"><span class="kw">ggplot</span>(tmp, <span class="kw">aes</span>(<span class="dt">x =</span> DC1, <span class="dt">y =</span> DC2, <span class="dt">colour =</span> Timepoint)) <span class="op">+</span></a> -<a class="sourceLine" id="cb678-9" data-line-number="9"><span class="st"> </span><span class="kw">geom_point</span>() <span class="op">+</span><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span
class="op">+</span></a> -<a class="sourceLine" id="cb678-10" data-line-number="10"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Diffusion component 1"</span>) <span class="op">+</span><span class="st"> </span></a> -<a class="sourceLine" id="cb678-11" data-line-number="11"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Diffusion component 2"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb678-12" data-line-number="12"><span class="st"> </span><span class="kw">theme_classic</span>()</a></code></pre></div> +<div class="sourceCode" id="cb761"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb761-1" data-line-number="1">deng <-<span class="st"> </span><span class="kw">logcounts</span>(deng_SCE)</a> +<a class="sourceLine" id="cb761-2" data-line-number="2"><span class="kw">colnames</span>(deng) <-<span class="st"> </span>cellLabels</a> +<a class="sourceLine" id="cb761-3" data-line-number="3">dm <-<span class="st"> </span><span class="kw">DiffusionMap</span>(<span class="kw">t</span>(deng))</a> +<a class="sourceLine" id="cb761-4" data-line-number="4"></a> +<a class="sourceLine" id="cb761-5" data-line-number="5">tmp <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">DC1 =</span> <span class="kw">eigenvectors</span>(dm)[,<span class="dv">1</span>],</a> +<a class="sourceLine" id="cb761-6" data-line-number="6"> <span class="dt">DC2 =</span> <span class="kw">eigenvectors</span>(dm)[,<span class="dv">2</span>],</a> +<a class="sourceLine" id="cb761-7" data-line-number="7"> <span class="dt">Timepoint =</span> deng_SCE<span class="op">$</span>cell_type2)</a> +<a class="sourceLine" id="cb761-8" data-line-number="8"><span class="kw">ggplot</span>(tmp, <span class="kw">aes</span>(<span class="dt">x =</span> DC1, <span class="dt">y =</span> DC2, <span class="dt">colour =</span> Timepoint)) <span class="op">+</span></a> +<a class="sourceLine" id="cb761-9" 
data-line-number="9"><span class="st"> </span><span class="kw">geom_point</span>() <span class="op">+</span><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span></a> +<a class="sourceLine" id="cb761-10" data-line-number="10"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Diffusion component 1"</span>) <span class="op">+</span><span class="st"> </span></a> +<a class="sourceLine" id="cb761-11" data-line-number="11"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Diffusion component 2"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb761-12" data-line-number="12"><span class="st"> </span><span class="kw">theme_classic</span>()</a></code></pre></div> <p><img src="pseudotime_files/figure-html/destiny-deng-1.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb679"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb679-1" data-line-number="1">deng_SCE<span class="op">$</span>pseudotime_diffusionmap <-<span class="st"> </span><span class="kw">rank</span>(<span class="kw">eigenvectors</span>(dm)[,<span class="dv">1</span>])</a> -<a class="sourceLine" id="cb679-2" data-line-number="2"></a> -<a class="sourceLine" id="cb679-3" data-line-number="3"><span class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(<span class="kw">colData</span>(deng_SCE)), </a> -<a class="sourceLine" id="cb679-4" data-line-number="4"> <span class="kw">aes</span>(<span class="dt">x =</span> pseudotime_diffusionmap, </a> -<a class="sourceLine" id="cb679-5" data-line-number="5"> <span class="dt">y =</span> cell_type2, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> -<a class="sourceLine" id="cb679-6" data-line-number="6"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) 
<span class="op">+</span></a> -<a class="sourceLine" id="cb679-7" data-line-number="7"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb679-8" data-line-number="8"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Diffusion map pseudotime (first diffusion map component)"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb679-9" data-line-number="9"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb679-10" data-line-number="10"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by diffusion map pseudotime"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb762"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb762-1" data-line-number="1">deng_SCE<span class="op">$</span>pseudotime_diffusionmap <-<span class="st"> </span><span class="kw">rank</span>(<span class="kw">eigenvectors</span>(dm)[,<span class="dv">1</span>])</a> +<a class="sourceLine" id="cb762-2" data-line-number="2"></a> +<a class="sourceLine" id="cb762-3" data-line-number="3"><span class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(<span class="kw">colData</span>(deng_SCE)), </a> +<a class="sourceLine" id="cb762-4" data-line-number="4"> <span class="kw">aes</span>(<span class="dt">x =</span> pseudotime_diffusionmap, </a> +<a class="sourceLine" id="cb762-5" data-line-number="5"> <span class="dt">y =</span> cell_type2, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> +<a class="sourceLine" id="cb762-6" data-line-number="6"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span 
class="ot">FALSE</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb762-7" data-line-number="7"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb762-8" data-line-number="8"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Diffusion map pseudotime (first diffusion map component)"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb762-9" data-line-number="9"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb762-10" data-line-number="10"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Cells ordered by diffusion map pseudotime"</span>)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/destiny-deng-2.png" width="90%" style="display: block; margin: auto;" /></p> <p>Like the other methods, using the first diffusion map component from destiny as pseudotime does a good job at ordering the early time-points (if we take high values as “earlier” in development), but it is unable to distinguish the later ones.</p> <p><strong>Exercise 2</strong> Do you get a better resolution between the later time points by considering additional eigenvectors?</p> <p><strong>Exercise 3</strong> How does the ordering change if you only use the genes identified by M3Drop?</p> </div> <div id="other-methods" class="section level3"> -<h3><span class="header-section-number">11.1.7</span> Other methods</h3> +<h3><span class="header-section-number">11.1.8</span> Other methods</h3> <div id="slicer" class="section level4"> -<h4><span class="header-section-number">11.1.7.1</span> SLICER</h4> +<h4><span class="header-section-number">11.1.8.1</span> SLICER</h4> <p>The SLICER<span class="citation">(Welch,
Hartemink, and Prins <a href="#ref-Welch2016-jr">2016</a>)</span> method is an algorithm for constructing trajectories that describe gene expression changes during a sequential biological process, just as Monocle and TSCAN are. SLICER is designed to capture @@ -906,9 +940,9 @@ than fluctuating randomly, across the set of cells. Following this, we determine which value of “k” (number of nearest neighbours) yields an embedding that most resembles a trajectory. Then we estimate the <a href="https://en.wikipedia.org/wiki/Nonlinear_dimensionality_reduction">locally linear embedding</a> of the cells.</p> -<div class="sourceCode" id="cb680"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb680-1" data-line-number="1"><span class="kw">library</span>(<span class="st">"lle"</span>)</a> -<a class="sourceLine" id="cb680-2" data-line-number="2">slicer_genes <-<span class="st"> </span><span class="kw">select_genes</span>(<span class="kw">t</span>(deng))</a> -<a class="sourceLine" id="cb680-3" data-line-number="3">k <-<span class="st"> </span><span class="kw">select_k</span>(<span class="kw">t</span>(deng[slicer_genes,]), <span class="dt">kmin =</span> <span class="dv">30</span>, <span class="dt">kmax=</span><span class="dv">60</span>)</a></code></pre></div> +<div class="sourceCode" id="cb763"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb763-1" data-line-number="1"><span class="kw">library</span>(<span class="st">"lle"</span>)</a> +<a class="sourceLine" id="cb763-2" data-line-number="2">slicer_genes <-<span class="st"> </span><span class="kw">select_genes</span>(<span class="kw">t</span>(deng))</a> +<a class="sourceLine" id="cb763-3" data-line-number="3">k <-<span class="st"> </span><span class="kw">select_k</span>(<span class="kw">t</span>(deng[slicer_genes,]), <span class="dt">kmin =</span> <span class="dv">30</span>, <span class="dt">kmax=</span><span class="dv">60</span>)</a></code></pre></div> <pre><code>## 
finding neighbours ## calculating weights ## computing coordinates @@ -930,22 +964,22 @@ embedding</a> of the cells.</p> ## finding neighbours ## calculating weights ## computing coordinates</code></pre> -<div class="sourceCode" id="cb682"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb682-1" data-line-number="1">slicer_traj_lle <-<span class="st"> </span><span class="kw">lle</span>(<span class="kw">t</span>(deng[slicer_genes,]), <span class="dt">m =</span> <span class="dv">2</span>, k)<span class="op">$</span>Y</a></code></pre></div> +<div class="sourceCode" id="cb765"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb765-1" data-line-number="1">slicer_traj_lle <-<span class="st"> </span><span class="kw">lle</span>(<span class="kw">t</span>(deng[slicer_genes,]), <span class="dt">m =</span> <span class="dv">2</span>, k)<span class="op">$</span>Y</a></code></pre></div> <pre><code>## finding neighbours ## calculating weights ## computing coordinates</code></pre> -<div class="sourceCode" id="cb684"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb684-1" data-line-number="1"><span class="kw">reducedDim</span>(deng_SCE, <span class="st">"LLE"</span>) <-<span class="st"> </span>slicer_traj_lle</a> -<a class="sourceLine" id="cb684-2" data-line-number="2"></a> -<a class="sourceLine" id="cb684-3" data-line-number="3">plot_df <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">slicer1 =</span> <span class="kw">reducedDim</span>(deng_SCE, <span class="st">"LLE"</span>)[,<span class="dv">1</span>],</a> -<a class="sourceLine" id="cb684-4" data-line-number="4"> <span class="dt">slicer2 =</span> <span class="kw">reducedDim</span>(deng_SCE, <span class="st">"LLE"</span>)[,<span class="dv">2</span>],</a> -<a class="sourceLine" id="cb684-5" data-line-number="5"> <span class="dt">cell_type2 =</span> deng_SCE<span class="op">$</span>cell_type2)</a> -<a 
class="sourceLine" id="cb684-6" data-line-number="6"><span class="kw">ggplot</span>(<span class="dt">data =</span> plot_df)<span class="op">+</span><span class="kw">geom_point</span>(<span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> slicer1, </a> -<a class="sourceLine" id="cb684-7" data-line-number="7"> <span class="dt">y =</span> slicer2, </a> -<a class="sourceLine" id="cb684-8" data-line-number="8"> <span class="dt">color =</span> cell_type2))<span class="op">+</span></a> -<a class="sourceLine" id="cb684-9" data-line-number="9"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color)<span class="op">+</span><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"LLE component 1"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb684-10" data-line-number="10"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"LLE component 2"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb684-11" data-line-number="11"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Locally linear embedding of cells from SLICER"</span>)<span class="op">+</span></a> -<a class="sourceLine" id="cb684-12" data-line-number="12"><span class="st"> </span><span class="kw">theme_classic</span>()</a></code></pre></div> +<div class="sourceCode" id="cb767"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb767-1" data-line-number="1"><span class="kw">reducedDim</span>(deng_SCE, <span class="st">"LLE"</span>) <-<span class="st"> </span>slicer_traj_lle</a> +<a class="sourceLine" id="cb767-2" data-line-number="2"></a> +<a class="sourceLine" id="cb767-3" data-line-number="3">plot_df <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">slicer1 =</span> <span class="kw">reducedDim</span>(deng_SCE, <span class="st">"LLE"</span>)[,<span class="dv">1</span>],</a> +<a 
class="sourceLine" id="cb767-4" data-line-number="4"> <span class="dt">slicer2 =</span> <span class="kw">reducedDim</span>(deng_SCE, <span class="st">"LLE"</span>)[,<span class="dv">2</span>],</a> +<a class="sourceLine" id="cb767-5" data-line-number="5"> <span class="dt">cell_type2 =</span> deng_SCE<span class="op">$</span>cell_type2)</a> +<a class="sourceLine" id="cb767-6" data-line-number="6"><span class="kw">ggplot</span>(<span class="dt">data =</span> plot_df)<span class="op">+</span><span class="kw">geom_point</span>(<span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> slicer1, </a> +<a class="sourceLine" id="cb767-7" data-line-number="7"> <span class="dt">y =</span> slicer2, </a> +<a class="sourceLine" id="cb767-8" data-line-number="8"> <span class="dt">color =</span> cell_type2))<span class="op">+</span></a> +<a class="sourceLine" id="cb767-9" data-line-number="9"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color)<span class="op">+</span><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"LLE component 1"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb767-10" data-line-number="10"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"LLE component 2"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb767-11" data-line-number="11"><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">"Locally linear embedding of cells from SLICER"</span>)<span class="op">+</span></a> +<a class="sourceLine" id="cb767-12" data-line-number="12"><span class="st"> </span><span class="kw">theme_classic</span>()</a></code></pre></div> <p><img src="pseudotime_files/figure-html/slicer-analyis-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>With the locally linear embedding computed we can construct a k-nearest neighbour graph that is fully connected. 
This plot displays @@ -953,38 +987,38 @@ a (yellow) circle for each cell, with the cell ID number overlaid in blue. Here we show the graph computed using 10 nearest neighbours. Here, SLICER appears to detect one major trajectory with one branch.</p> -<div class="sourceCode" id="cb685"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb685-1" data-line-number="1">slicer_traj_graph <-<span class="st"> </span><span class="kw">conn_knn_graph</span>(slicer_traj_lle, <span class="dv">10</span>)</a> -<a class="sourceLine" id="cb685-2" data-line-number="2"><span class="kw">plot</span>(slicer_traj_graph, <span class="dt">main =</span> <span class="st">"Fully connected kNN graph from SLICER"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb768"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb768-1" data-line-number="1">slicer_traj_graph <-<span class="st"> </span><span class="kw">conn_knn_graph</span>(slicer_traj_lle, <span class="dv">10</span>)</a> +<a class="sourceLine" id="cb768-2" data-line-number="2"><span class="kw">plot</span>(slicer_traj_graph, <span class="dt">main =</span> <span class="st">"Fully connected kNN graph from SLICER"</span>)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/slicer-build-graph-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>From this graph we can identify “extreme†cells that are candidates for start/end cells in the trajectory.</p> -<div class="sourceCode" id="cb686"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb686-1" data-line-number="1">ends <-<span class="st"> </span><span class="kw">find_extreme_cells</span>(slicer_traj_graph, slicer_traj_lle)</a></code></pre></div> +<div class="sourceCode" id="cb769"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb769-1" data-line-number="1">ends <-<span class="st"> </span><span 
class="kw">find_extreme_cells</span>(slicer_traj_graph, slicer_traj_lle)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/slicer-1.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb687"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb687-1" data-line-number="1">start <-<span class="st"> </span>ends[<span class="dv">1</span>]</a></code></pre></div> +<div class="sourceCode" id="cb770"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb770-1" data-line-number="1">start <-<span class="st"> </span>ends[<span class="dv">1</span>]</a></code></pre></div> <p>Having defined a start cell we can order the cells in the estimated pseudotime.</p> -<div class="sourceCode" id="cb688"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb688-1" data-line-number="1">pseudotime_order_slicer <-<span class="st"> </span><span class="kw">cell_order</span>(slicer_traj_graph, start)</a> -<a class="sourceLine" id="cb688-2" data-line-number="2">branches <-<span class="st"> </span><span class="kw">assign_branches</span>(slicer_traj_graph, start)</a> -<a class="sourceLine" id="cb688-3" data-line-number="3"></a> -<a class="sourceLine" id="cb688-4" data-line-number="4">pseudotime_slicer <-</a> -<a class="sourceLine" id="cb688-5" data-line-number="5"><span class="st"> </span><span class="kw">data.frame</span>(</a> -<a class="sourceLine" id="cb688-6" data-line-number="6"> <span class="dt">Timepoint =</span> cellLabels,</a> -<a class="sourceLine" id="cb688-7" data-line-number="7"> <span class="dt">pseudotime =</span> <span class="ot">NA</span>,</a> -<a class="sourceLine" id="cb688-8" data-line-number="8"> <span class="dt">State =</span> branches</a> -<a class="sourceLine" id="cb688-9" data-line-number="9"> )</a> -<a class="sourceLine" id="cb688-10" data-line-number="10">pseudotime_slicer<span class="op">$</span>pseudotime[pseudotime_order_slicer] <-</a> 
-<a class="sourceLine" id="cb688-11" data-line-number="11"><span class="st"> </span><span class="dv">1</span><span class="op">:</span><span class="kw">length</span>(pseudotime_order_slicer)</a> -<a class="sourceLine" id="cb688-12" data-line-number="12">deng_SCE<span class="op">$</span>pseudotime_slicer <-<span class="st"> </span>pseudotime_slicer<span class="op">$</span>pseudotime</a></code></pre></div> +<div class="sourceCode" id="cb771"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb771-1" data-line-number="1">pseudotime_order_slicer <-<span class="st"> </span><span class="kw">cell_order</span>(slicer_traj_graph, start)</a> +<a class="sourceLine" id="cb771-2" data-line-number="2">branches <-<span class="st"> </span><span class="kw">assign_branches</span>(slicer_traj_graph, start)</a> +<a class="sourceLine" id="cb771-3" data-line-number="3"></a> +<a class="sourceLine" id="cb771-4" data-line-number="4">pseudotime_slicer <-</a> +<a class="sourceLine" id="cb771-5" data-line-number="5"><span class="st"> </span><span class="kw">data.frame</span>(</a> +<a class="sourceLine" id="cb771-6" data-line-number="6"> <span class="dt">Timepoint =</span> cellLabels,</a> +<a class="sourceLine" id="cb771-7" data-line-number="7"> <span class="dt">pseudotime =</span> <span class="ot">NA</span>,</a> +<a class="sourceLine" id="cb771-8" data-line-number="8"> <span class="dt">State =</span> branches</a> +<a class="sourceLine" id="cb771-9" data-line-number="9"> )</a> +<a class="sourceLine" id="cb771-10" data-line-number="10">pseudotime_slicer<span class="op">$</span>pseudotime[pseudotime_order_slicer] <-</a> +<a class="sourceLine" id="cb771-11" data-line-number="11"><span class="st"> </span><span class="dv">1</span><span class="op">:</span><span class="kw">length</span>(pseudotime_order_slicer)</a> +<a class="sourceLine" id="cb771-12" data-line-number="12">deng_SCE<span class="op">$</span>pseudotime_slicer <-<span class="st"> </span>pseudotime_slicer<span 
class="op">$</span>pseudotime</a></code></pre></div> <p>We can again compare the inferred pseudotime to the known sampling timepoints. SLICER does not provide a pseudotime value per se, just an ordering of cells.</p> -<div class="sourceCode" id="cb689"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb689-1" data-line-number="1"><span class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(<span class="kw">colData</span>(deng_SCE)), </a> -<a class="sourceLine" id="cb689-2" data-line-number="2"> <span class="kw">aes</span>(<span class="dt">x =</span> pseudotime_slicer, </a> -<a class="sourceLine" id="cb689-3" data-line-number="3"> <span class="dt">y =</span> cell_type2, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> -<a class="sourceLine" id="cb689-4" data-line-number="4"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb689-5" data-line-number="5"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb689-6" data-line-number="6"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"SLICER pseudotime (cell ordering)"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb689-7" data-line-number="7"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb689-8" data-line-number="8"><span class="st"> </span><span class="kw">theme_classic</span>()</a></code></pre></div> +<div class="sourceCode" id="cb772"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb772-1" data-line-number="1"><span 
class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(<span class="kw">colData</span>(deng_SCE)), </a> +<a class="sourceLine" id="cb772-2" data-line-number="2"> <span class="kw">aes</span>(<span class="dt">x =</span> pseudotime_slicer, </a> +<a class="sourceLine" id="cb772-3" data-line-number="3"> <span class="dt">y =</span> cell_type2, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> +<a class="sourceLine" id="cb772-4" data-line-number="4"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb772-5" data-line-number="5"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb772-6" data-line-number="6"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"SLICER pseudotime (cell ordering)"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb772-7" data-line-number="7"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb772-8" data-line-number="8"><span class="st"> </span><span class="kw">theme_classic</span>()</a></code></pre></div> <p><img src="pseudotime_files/figure-html/slicer-vs-truth-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>Like the previous method, SLICER <span class="citation">(Welch, Hartemink, and Prins <a href="#ref-Welch2016-jr">2016</a>)</span> here provides a good ordering for the early time points. 
It places “16cell†cells before “8cell†cells, but provides better ordering for blast cells than many of the earlier methods.</p> @@ -994,7 +1028,7 @@ the call to <code>conn_knn_graph</code>?</p> of genes from those chosen by SLICER (e.g. the genes identified by M3Drop)?</p> </div> <div id="ouija" class="section level4"> -<h4><span class="header-section-number">11.1.7.2</span> Ouija</h4> +<h4><span class="header-section-number">11.1.8.2</span> Ouija</h4> <p>Ouija (<a href="http://kieranrcampbell.github.io/ouija/" class="uri">http://kieranrcampbell.github.io/ouija/</a>) takes a different approach from the pseudotime estimation methods we have looked at so far. Earlier methods have all been “unsupervisedâ€, which is to say that apart from perhaps selecting informative genes we do not supply the method with any prior information about how we expect certain genes or the trajectory as a whole to behave.</p> <p>Ouija, in contrast, is a probabilistic framework that allows for interpretable learning of single-cell pseudotimes using only small panels of marker genes. This method:</p> <ul> @@ -1011,15 +1045,15 @@ of genes from those chosen by SLICER (e.g. the genes identified by M3Drop)?</p> </ul> <p>With Ouija we can model genes as either exhibiting monotonic up or down regulation (known as switch-like behaviour), or transient behaviour where the gene briefly peaks. 
By default, Ouija assumes all genes exhibit switch-like behaviour (the authors assure us not to worry if we get it wrong - the noise model means incorrectly specifying a transient gene as switch-like has minimal effect).</p> <p>Here we can “cheat†a little and check that our selected marker genes do actually identify different timepoints of the differentiation process.</p> -<div class="sourceCode" id="cb690"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb690-1" data-line-number="1">ouija_markers_down <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Dazl"</span>, <span class="st">"Rnf17"</span>, <span class="st">"Sycp3"</span>, <span class="st">"Fgf8"</span>, </a> -<a class="sourceLine" id="cb690-2" data-line-number="2"> <span class="st">"Egfr"</span>, <span class="st">"Bmp5"</span>, <span class="st">"Bmp15"</span>, <span class="st">"Pou5f1"</span>)</a> -<a class="sourceLine" id="cb690-3" data-line-number="3">ouija_markers_up <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Creb3"</span>, <span class="st">"Gpx4"</span>, <span class="st">"Krt8"</span>, <span class="st">"Elf5"</span>, <span class="st">"Cdx2"</span>, </a> -<a class="sourceLine" id="cb690-4" data-line-number="4"> <span class="st">"Tdgf1"</span>, <span class="st">"Gdf3"</span>, <span class="st">"Eomes"</span>)</a> -<a class="sourceLine" id="cb690-5" data-line-number="5">ouija_markers_transient <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Zscan4b"</span>, <span class="st">"Foxa1"</span>, <span class="st">"Prdm14"</span>, <span class="st">"Sox21"</span>)</a> -<a class="sourceLine" id="cb690-6" data-line-number="6">ouija_markers <-<span class="st"> </span><span class="kw">c</span>(ouija_markers_down, ouija_markers_up, </a> -<a class="sourceLine" id="cb690-7" data-line-number="7"> ouija_markers_transient)</a> -<a class="sourceLine" id="cb690-8" data-line-number="8"><span 
class="kw">plotExpression</span>(deng_SCE, ouija_markers, <span class="dt">x =</span> <span class="st">"cell_type2"</span>, <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb690-9" data-line-number="9"><span class="st"> </span><span class="kw">theme</span>(<span class="dt">axis.text.x =</span> <span class="kw">element_text</span>(<span class="dt">angle =</span> <span class="dv">60</span>, <span class="dt">hjust =</span> <span class="dv">1</span>))</a></code></pre></div> +<div class="sourceCode" id="cb773"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb773-1" data-line-number="1">ouija_markers_down <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Dazl"</span>, <span class="st">"Rnf17"</span>, <span class="st">"Sycp3"</span>, <span class="st">"Fgf8"</span>, </a> +<a class="sourceLine" id="cb773-2" data-line-number="2"> <span class="st">"Egfr"</span>, <span class="st">"Bmp5"</span>, <span class="st">"Bmp15"</span>, <span class="st">"Pou5f1"</span>)</a> +<a class="sourceLine" id="cb773-3" data-line-number="3">ouija_markers_up <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Creb3"</span>, <span class="st">"Gpx4"</span>, <span class="st">"Krt8"</span>, <span class="st">"Elf5"</span>, <span class="st">"Cdx2"</span>, </a> +<a class="sourceLine" id="cb773-4" data-line-number="4"> <span class="st">"Tdgf1"</span>, <span class="st">"Gdf3"</span>, <span class="st">"Eomes"</span>)</a> +<a class="sourceLine" id="cb773-5" data-line-number="5">ouija_markers_transient <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Zscan4b"</span>, <span class="st">"Foxa1"</span>, <span class="st">"Prdm14"</span>, <span class="st">"Sox21"</span>)</a> +<a class="sourceLine" id="cb773-6" data-line-number="6">ouija_markers <-<span class="st"> </span><span class="kw">c</span>(ouija_markers_down, ouija_markers_up, </a> +<a 
class="sourceLine" id="cb773-7" data-line-number="7"> ouija_markers_transient)</a> +<a class="sourceLine" id="cb773-8" data-line-number="8"><span class="kw">plotExpression</span>(deng_SCE, ouija_markers, <span class="dt">x =</span> <span class="st">"cell_type2"</span>, <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb773-9" data-line-number="9"><span class="st"> </span><span class="kw">theme</span>(<span class="dt">axis.text.x =</span> <span class="kw">element_text</span>(<span class="dt">angle =</span> <span class="dv">60</span>, <span class="dt">hjust =</span> <span class="dv">1</span>))</a></code></pre></div> <p><img src="pseudotime_files/figure-html/ouija-response-type-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>In order to fit the pseudotimes wesimply call <code>ouija</code>, passing in the expected response types. Note that if no response types are provided then they are all assumed to be switch-like by default, which we will do here. The input to Ouija can be a cell-by-gene matrix of non-negative expression values, or an ExpressionSet object, or, happily, by selecting the <code>logcounts</code> values from a SingleCellExperiment object.</p> <p>We can apply prior information about whether genes are up- or down-regulated across the differentiation process, and also provide prior information about when the switch in expression or a peak in expression is likely to occur.</p> @@ -1030,75 +1064,75 @@ of genes from those chosen by SLICER (e.g. the genes identified by M3Drop)?</p> </ul> <p>In general, HMC will provide more accurate inference with approximately correct posterior variance for all parameters. 
However, VB is orders of magnitude quicker than HMC and while it may underestimate posterior variance, the Ouija authors suggest that anecdotally it often performs as well as HMC for discovering posterior pseudotimes.</p> <p>To help the Ouija model, we provide it with prior information about the strength of switches for up- and down-regulated genes. By setting switch strength to -10 for down-regulated genes and 10 for up-regulated genes with a prior strength standard deviation of 0.5 we are telling the model that we are confident about the expected behaviour of these genes across the differentiation process.</p> -<div class="sourceCode" id="cb691"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb691-1" data-line-number="1"><span class="kw">options</span>(<span class="dt">mc.cores =</span> parallel<span class="op">::</span><span class="kw">detectCores</span>())</a> -<a class="sourceLine" id="cb691-2" data-line-number="2">response_type <-<span class="st"> </span><span class="kw">c</span>(<span class="kw">rep</span>(<span class="st">"switch"</span>, <span class="kw">length</span>(ouija_markers_down) <span class="op">+</span><span class="st"> </span></a> -<a class="sourceLine" id="cb691-3" data-line-number="3"><span class="st"> </span><span class="kw">length</span>(ouija_markers_up)), </a> -<a class="sourceLine" id="cb691-4" data-line-number="4"> <span class="kw">rep</span>(<span class="st">"transient"</span>, <span class="kw">length</span>(ouija_markers_transient)))</a> -<a class="sourceLine" id="cb691-5" data-line-number="5">switch_strengths <-<span class="st"> </span><span class="kw">c</span>(<span class="kw">rep</span>(<span class="op">-</span><span class="dv">10</span>, <span class="kw">length</span>(ouija_markers_down)),</a> -<a class="sourceLine" id="cb691-6" data-line-number="6"> <span class="kw">rep</span>(<span class="dv">10</span>, <span class="kw">length</span>(ouija_markers_up)))</a> -<a class="sourceLine" id="cb691-7" 
data-line-number="7">switch_strength_sd <-<span class="st"> </span><span class="kw">c</span>(<span class="kw">rep</span>(<span class="fl">0.5</span>, <span class="kw">length</span>(ouija_markers_down)),</a> -<a class="sourceLine" id="cb691-8" data-line-number="8"> <span class="kw">rep</span>(<span class="fl">0.5</span>, <span class="kw">length</span>(ouija_markers_up)))</a> -<a class="sourceLine" id="cb691-9" data-line-number="9">garbage <-<span class="st"> </span><span class="kw">capture.output</span>(</a> -<a class="sourceLine" id="cb691-10" data-line-number="10"> oui_vb <-<span class="st"> </span><span class="kw">ouija</span>(deng_SCE[ouija_markers,],</a> -<a class="sourceLine" id="cb691-11" data-line-number="11"> <span class="dt">single_cell_experiment_assay =</span> <span class="st">"logcounts"</span>, </a> -<a class="sourceLine" id="cb691-12" data-line-number="12"> <span class="dt">response_type =</span> response_type,</a> -<a class="sourceLine" id="cb691-13" data-line-number="13"> <span class="dt">switch_strengths =</span> switch_strengths,</a> -<a class="sourceLine" id="cb691-14" data-line-number="14"> <span class="dt">switch_strength_sd =</span> switch_strength_sd,</a> -<a class="sourceLine" id="cb691-15" data-line-number="15"> <span class="dt">inference_type =</span> <span class="st">"vb"</span>)</a> -<a class="sourceLine" id="cb691-16" data-line-number="16">)</a> -<a class="sourceLine" id="cb691-17" data-line-number="17"></a> -<a class="sourceLine" id="cb691-18" data-line-number="18"><span class="kw">print</span>(oui_vb)</a></code></pre></div> +<div class="sourceCode" id="cb774"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb774-1" data-line-number="1"><span class="kw">options</span>(<span class="dt">mc.cores =</span> parallel<span class="op">::</span><span class="kw">detectCores</span>())</a> +<a class="sourceLine" id="cb774-2" data-line-number="2">response_type <-<span class="st"> </span><span class="kw">c</span>(<span 
class="kw">rep</span>(<span class="st">"switch"</span>, <span class="kw">length</span>(ouija_markers_down) <span class="op">+</span><span class="st"> </span></a> +<a class="sourceLine" id="cb774-3" data-line-number="3"><span class="st"> </span><span class="kw">length</span>(ouija_markers_up)), </a> +<a class="sourceLine" id="cb774-4" data-line-number="4"> <span class="kw">rep</span>(<span class="st">"transient"</span>, <span class="kw">length</span>(ouija_markers_transient)))</a> +<a class="sourceLine" id="cb774-5" data-line-number="5">switch_strengths <-<span class="st"> </span><span class="kw">c</span>(<span class="kw">rep</span>(<span class="op">-</span><span class="dv">10</span>, <span class="kw">length</span>(ouija_markers_down)),</a> +<a class="sourceLine" id="cb774-6" data-line-number="6"> <span class="kw">rep</span>(<span class="dv">10</span>, <span class="kw">length</span>(ouija_markers_up)))</a> +<a class="sourceLine" id="cb774-7" data-line-number="7">switch_strength_sd <-<span class="st"> </span><span class="kw">c</span>(<span class="kw">rep</span>(<span class="fl">0.5</span>, <span class="kw">length</span>(ouija_markers_down)),</a> +<a class="sourceLine" id="cb774-8" data-line-number="8"> <span class="kw">rep</span>(<span class="fl">0.5</span>, <span class="kw">length</span>(ouija_markers_up)))</a> +<a class="sourceLine" id="cb774-9" data-line-number="9">garbage <-<span class="st"> </span><span class="kw">capture.output</span>(</a> +<a class="sourceLine" id="cb774-10" data-line-number="10"> oui_vb <-<span class="st"> </span><span class="kw">ouija</span>(deng_SCE[ouija_markers,],</a> +<a class="sourceLine" id="cb774-11" data-line-number="11"> <span class="dt">single_cell_experiment_assay =</span> <span class="st">"logcounts"</span>, </a> +<a class="sourceLine" id="cb774-12" data-line-number="12"> <span class="dt">response_type =</span> response_type,</a> +<a class="sourceLine" id="cb774-13" data-line-number="13"> <span class="dt">switch_strengths 
=</span> switch_strengths,</a> +<a class="sourceLine" id="cb774-14" data-line-number="14"> <span class="dt">switch_strength_sd =</span> switch_strength_sd,</a> +<a class="sourceLine" id="cb774-15" data-line-number="15"> <span class="dt">inference_type =</span> <span class="st">"vb"</span>)</a> +<a class="sourceLine" id="cb774-16" data-line-number="16">)</a> +<a class="sourceLine" id="cb774-17" data-line-number="17"></a> +<a class="sourceLine" id="cb774-18" data-line-number="18"><span class="kw">print</span>(oui_vb)</a></code></pre></div> <pre><code>## A Ouija fit with 268 cells and 20 marker genes ## Inference type: Variational Bayes ## (Gene behaviour) Switch/transient: 16 / 4</code></pre> <p>We can plot the gene expression over pseudotime along with the maximum a posteriori (MAP) estimates of the mean function (the sigmoid or Gaussian transient function) using the plot_expression function.</p> -<div class="sourceCode" id="cb693"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb693-1" data-line-number="1"><span class="kw">plot_expression</span>(oui_vb)</a></code></pre></div> +<div class="sourceCode" id="cb776"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb776-1" data-line-number="1"><span class="kw">plot_expression</span>(oui_vb)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/ouija-plot-exprs-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>We can also visualise when in the trajectory gene regulation behaviour occurs, either in the form of the switch time or the peak time (for switch-like or transient genes) using the plot_switch_times and plot_transient_times functions:</p> -<div class="sourceCode" id="cb694"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb694-1" data-line-number="1"><span class="kw">plot_switch_times</span>(oui_vb)</a></code></pre></div> +<div class="sourceCode" id="cb777"><pre class="sourceCode r"><code 
class="sourceCode r"><a class="sourceLine" id="cb777-1" data-line-number="1"><span class="kw">plot_switch_times</span>(oui_vb)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/ouija-plot-switch-times-1.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb695"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb695-1" data-line-number="1"><span class="kw">plot_peak_times</span>(oui_vb)</a></code></pre></div> +<div class="sourceCode" id="cb778"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb778-1" data-line-number="1"><span class="kw">plot_peak_times</span>(oui_vb)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/ouija-plot-switch-times-2.png" width="90%" style="display: block; margin: auto;" /></p> <p>Identify metastable states using consistency matrices.</p> -<div class="sourceCode" id="cb696"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb696-1" data-line-number="1">cmo <-<span class="st"> </span><span class="kw">consistency_matrix</span>(oui_vb)</a> -<a class="sourceLine" id="cb696-2" data-line-number="2"><span class="kw">plot_consistency</span>(oui_vb)</a></code></pre></div> +<div class="sourceCode" id="cb779"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb779-1" data-line-number="1">cmo <-<span class="st"> </span><span class="kw">consistency_matrix</span>(oui_vb)</a> +<a class="sourceLine" id="cb779-2" data-line-number="2"><span class="kw">plot_consistency</span>(oui_vb)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/ouija-consistency-1.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb697"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb697-1" data-line-number="1">cell_classifications <-<span class="st"> </span><span 
class="kw">cluster_consistency</span>(cmo)</a></code></pre></div> -<div class="sourceCode" id="cb698"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb698-1" data-line-number="1">map_pst <-<span class="st"> </span><span class="kw">map_pseudotime</span>(oui_vb)</a> -<a class="sourceLine" id="cb698-2" data-line-number="2">ouija_pseudotime <-<span class="st"> </span><span class="kw">data.frame</span>(map_pst, cell_classifications)</a> -<a class="sourceLine" id="cb698-3" data-line-number="3"></a> -<a class="sourceLine" id="cb698-4" data-line-number="4"><span class="kw">ggplot</span>(ouija_pseudotime, <span class="kw">aes</span>(<span class="dt">x =</span> map_pst, <span class="dt">y =</span> cell_classifications)) <span class="op">+</span></a> -<a class="sourceLine" id="cb698-5" data-line-number="5"><span class="st"> </span><span class="kw">geom_point</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb698-6" data-line-number="6"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"MAP pseudotime"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb698-7" data-line-number="7"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Cell classification"</span>)</a></code></pre></div> +<div class="sourceCode" id="cb780"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb780-1" data-line-number="1">cell_classifications <-<span class="st"> </span><span class="kw">cluster_consistency</span>(cmo)</a></code></pre></div> +<div class="sourceCode" id="cb781"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb781-1" data-line-number="1">map_pst <-<span class="st"> </span><span class="kw">map_pseudotime</span>(oui_vb)</a> +<a class="sourceLine" id="cb781-2" data-line-number="2">ouija_pseudotime <-<span class="st"> </span><span class="kw">data.frame</span>(map_pst, cell_classifications)</a> +<a class="sourceLine" id="cb781-3" 
data-line-number="3"></a> +<a class="sourceLine" id="cb781-4" data-line-number="4"><span class="kw">ggplot</span>(ouija_pseudotime, <span class="kw">aes</span>(<span class="dt">x =</span> map_pst, <span class="dt">y =</span> cell_classifications)) <span class="op">+</span></a> +<a class="sourceLine" id="cb781-5" data-line-number="5"><span class="st"> </span><span class="kw">geom_point</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb781-6" data-line-number="6"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"MAP pseudotime"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb781-7" data-line-number="7"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Cell classification"</span>)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/ouija-pseudotime-1.png" width="90%" style="display: block; margin: auto;" /></p> -<div class="sourceCode" id="cb699"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb699-1" data-line-number="1">deng_SCE<span class="op">$</span>pseudotime_ouija <-<span class="st"> </span>ouija_pseudotime<span class="op">$</span>map_pst</a> -<a class="sourceLine" id="cb699-2" data-line-number="2">deng_SCE<span class="op">$</span>ouija_cell_class <-<span class="st"> </span>ouija_pseudotime<span class="op">$</span>cell_classifications</a> -<a class="sourceLine" id="cb699-3" data-line-number="3"></a> -<a class="sourceLine" id="cb699-4" data-line-number="4"><span class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(<span class="kw">colData</span>(deng_SCE)), </a> -<a class="sourceLine" id="cb699-5" data-line-number="5"> <span class="kw">aes</span>(<span class="dt">x =</span> pseudotime_ouija, </a> -<a class="sourceLine" id="cb699-6" data-line-number="6"> <span class="dt">y =</span> cell_type2, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> -<a class="sourceLine" id="cb699-7" data-line-number="7"><span 
class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb699-8" data-line-number="8"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb699-9" data-line-number="9"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Ouija pseudotime"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb699-10" data-line-number="10"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb699-11" data-line-number="11"><span class="st"> </span><span class="kw">theme_classic</span>()</a></code></pre></div> +<div class="sourceCode" id="cb782"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb782-1" data-line-number="1">deng_SCE<span class="op">$</span>pseudotime_ouija <-<span class="st"> </span>ouija_pseudotime<span class="op">$</span>map_pst</a> +<a class="sourceLine" id="cb782-2" data-line-number="2">deng_SCE<span class="op">$</span>ouija_cell_class <-<span class="st"> </span>ouija_pseudotime<span class="op">$</span>cell_classifications</a> +<a class="sourceLine" id="cb782-3" data-line-number="3"></a> +<a class="sourceLine" id="cb782-4" data-line-number="4"><span class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(<span class="kw">colData</span>(deng_SCE)), </a> +<a class="sourceLine" id="cb782-5" data-line-number="5"> <span class="kw">aes</span>(<span class="dt">x =</span> pseudotime_ouija, </a> +<a class="sourceLine" id="cb782-6" data-line-number="6"> <span class="dt">y =</span> cell_type2, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> +<a 
class="sourceLine" id="cb782-7" data-line-number="7"><span class="st"> </span><span class="kw">geom_quasirandom</span>(<span class="dt">groupOnX =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb782-8" data-line-number="8"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb782-9" data-line-number="9"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Ouija pseudotime"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb782-10" data-line-number="10"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Timepoint"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb782-11" data-line-number="11"><span class="st"> </span><span class="kw">theme_classic</span>()</a></code></pre></div> <p><img src="pseudotime_files/figure-html/ouija-pseudotime-2.png" width="90%" style="display: block; margin: auto;" /></p> <p>Ouija does quite well in the ordering of the cells here, although it can be sensitive to the choice of marker genes and prior information supplied. 
How do the results change if you select different marker genes or change the priors?</p> <p>Ouija identifies four metastable states here, which we might annotate as “zygote/2cellâ€, “4/8/16 cellâ€, “blast1†and “blast2â€.</p> -<div class="sourceCode" id="cb700"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb700-1" data-line-number="1"><span class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(<span class="kw">colData</span>(deng_SCE)), </a> -<a class="sourceLine" id="cb700-2" data-line-number="2"> <span class="kw">aes</span>(<span class="dt">x =</span> <span class="kw">as.factor</span>(ouija_cell_class), </a> -<a class="sourceLine" id="cb700-3" data-line-number="3"> <span class="dt">y =</span> pseudotime_ouija, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> -<a class="sourceLine" id="cb700-4" data-line-number="4"><span class="st"> </span><span class="kw">geom_boxplot</span>() <span class="op">+</span><span class="st"> </span></a> -<a class="sourceLine" id="cb700-5" data-line-number="5"><span class="st"> </span><span class="kw">coord_flip</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb700-6" data-line-number="6"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> -<a class="sourceLine" id="cb700-7" data-line-number="7"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Ouija cell classification"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb700-8" data-line-number="8"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Ouija pseudotime"</span>) <span class="op">+</span></a> -<a class="sourceLine" id="cb700-9" data-line-number="9"><span class="st"> </span><span class="kw">theme_classic</span>()</a></code></pre></div> +<div 
class="sourceCode" id="cb783"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb783-1" data-line-number="1"><span class="kw">ggplot</span>(<span class="kw">as.data.frame</span>(<span class="kw">colData</span>(deng_SCE)), </a> +<a class="sourceLine" id="cb783-2" data-line-number="2"> <span class="kw">aes</span>(<span class="dt">x =</span> <span class="kw">as.factor</span>(ouija_cell_class), </a> +<a class="sourceLine" id="cb783-3" data-line-number="3"> <span class="dt">y =</span> pseudotime_ouija, <span class="dt">colour =</span> cell_type2)) <span class="op">+</span></a> +<a class="sourceLine" id="cb783-4" data-line-number="4"><span class="st"> </span><span class="kw">geom_boxplot</span>() <span class="op">+</span><span class="st"> </span></a> +<a class="sourceLine" id="cb783-5" data-line-number="5"><span class="st"> </span><span class="kw">coord_flip</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb783-6" data-line-number="6"><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) <span class="op">+</span><span class="st"> </span><span class="kw">theme_classic</span>() <span class="op">+</span></a> +<a class="sourceLine" id="cb783-7" data-line-number="7"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Ouija cell classification"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb783-8" data-line-number="8"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Ouija pseudotime"</span>) <span class="op">+</span></a> +<a class="sourceLine" id="cb783-9" data-line-number="9"><span class="st"> </span><span class="kw">theme_classic</span>()</a></code></pre></div> <p><img src="pseudotime_files/figure-html/ouija-states-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>A common analysis is to work out the regulation orderings of genes. For example, is gene A upregulated before gene B? 
Does gene C peak before the downregulation of gene D? Ouija answers these questions in terms of a Bayesian hypothesis test of whether the difference in regulation timing (either switch time or peak time) is significantly different to 0. This is collated using the gene_regulation function.</p> -<div class="sourceCode" id="cb701"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb701-1" data-line-number="1">gene_regs <-<span class="st"> </span><span class="kw">gene_regulation</span>(oui_vb)</a> -<a class="sourceLine" id="cb701-2" data-line-number="2"><span class="kw">head</span>(gene_regs)</a></code></pre></div> +<div class="sourceCode" id="cb784"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb784-1" data-line-number="1">gene_regs <-<span class="st"> </span><span class="kw">gene_regulation</span>(oui_vb)</a> +<a class="sourceLine" id="cb784-2" data-line-number="2"><span class="kw">head</span>(gene_regs)</a></code></pre></div> <pre><code>## # A tibble: 6 x 7 ## # Groups: label, gene_A [6] ## label gene_A gene_B mean_difference lower_95 upper_95 significant @@ -1114,28 +1148,28 @@ of genes from those chosen by SLICER (e.g. 
the genes identified by M3Drop)?</p> </div> </div> <div id="comparison-of-the-methods" class="section level3"> -<h3><span class="header-section-number">11.1.8</span> Comparison of the methods</h3> +<h3><span class="header-section-number">11.1.9</span> Comparison of the methods</h3> <p>How do the trajectories inferred by TSCAN, Monocle, Diffusion Map, SLICER and Ouija compare?</p> <p>TSCAN and Diffusion Map methods get the trajectory the “wrong way roundâ€, so we’ll adjust that for these comparisons.</p> -<div class="sourceCode" id="cb703"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb703-1" data-line-number="1">df_pseudotime <-<span class="st"> </span><span class="kw">as.data.frame</span>(</a> -<a class="sourceLine" id="cb703-2" data-line-number="2"> <span class="kw">colData</span>(deng_SCE)[, <span class="kw">grep</span>(<span class="st">"pseudotime"</span>, <span class="kw">colnames</span>(<span class="kw">colData</span>(deng_SCE)))]</a> -<a class="sourceLine" id="cb703-3" data-line-number="3">)</a> -<a class="sourceLine" id="cb703-4" data-line-number="4"><span class="kw">colnames</span>(df_pseudotime) <-<span class="st"> </span><span class="kw">gsub</span>(<span class="st">"pseudotime_"</span>, <span class="st">""</span>, </a> -<a class="sourceLine" id="cb703-5" data-line-number="5"> <span class="kw">colnames</span>(df_pseudotime))</a> -<a class="sourceLine" id="cb703-6" data-line-number="6">df_pseudotime<span class="op">$</span>PC1 <-<span class="st"> </span><span class="kw">reducedDim</span>(deng_SCE,<span class="st">"PCA"</span>)[,<span class="dv">1</span>]</a> -<a class="sourceLine" id="cb703-7" data-line-number="7">df_pseudotime<span class="op">$</span>order_tscan <-<span class="st"> </span><span class="op">-</span>df_pseudotime<span class="op">$</span>order_tscan</a> -<a class="sourceLine" id="cb703-8" data-line-number="8"><span class="co">#df_pseudotime$diffusionmap <- df_pseudotime$diffusionmap</span></a> -<a 
class="sourceLine" id="cb703-9" data-line-number="9">df_pseudotime<span class="op">$</span>slingshot1 <-<span class="st"> </span><span class="kw">colData</span>(deng_SCE)<span class="op">$</span>slingPseudotime_<span class="dv">1</span></a> -<a class="sourceLine" id="cb703-10" data-line-number="10"></a> -<a class="sourceLine" id="cb703-11" data-line-number="11"><span class="kw">corrplot.mixed</span>(<span class="kw">cor</span>(df_pseudotime, <span class="dt">use =</span> <span class="st">"na.or.complete"</span>), </a> -<a class="sourceLine" id="cb703-12" data-line-number="12"> <span class="dt">order =</span> <span class="st">"hclust"</span>, <span class="dt">tl.col =</span> <span class="st">"black"</span>,</a> -<a class="sourceLine" id="cb703-13" data-line-number="13"> <span class="dt">main =</span> <span class="st">"Correlation matrix for pseudotime results"</span>,</a> -<a class="sourceLine" id="cb703-14" data-line-number="14"> <span class="dt">mar =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">0</span>, <span class="fl">3.1</span>, <span class="dv">0</span>))</a></code></pre></div> +<div class="sourceCode" id="cb786"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb786-1" data-line-number="1">df_pseudotime <-<span class="st"> </span><span class="kw">as.data.frame</span>(</a> +<a class="sourceLine" id="cb786-2" data-line-number="2"> <span class="kw">colData</span>(deng_SCE)[, <span class="kw">grep</span>(<span class="st">"pseudotime"</span>, <span class="kw">colnames</span>(<span class="kw">colData</span>(deng_SCE)))]</a> +<a class="sourceLine" id="cb786-3" data-line-number="3">)</a> +<a class="sourceLine" id="cb786-4" data-line-number="4"><span class="kw">colnames</span>(df_pseudotime) <-<span class="st"> </span><span class="kw">gsub</span>(<span class="st">"pseudotime_"</span>, <span class="st">""</span>, </a> +<a class="sourceLine" id="cb786-5" data-line-number="5"> <span 
class="kw">colnames</span>(df_pseudotime))</a> +<a class="sourceLine" id="cb786-6" data-line-number="6">df_pseudotime<span class="op">$</span>PC1 <-<span class="st"> </span><span class="kw">reducedDim</span>(deng_SCE,<span class="st">"PCA"</span>)[,<span class="dv">1</span>]</a> +<a class="sourceLine" id="cb786-7" data-line-number="7">df_pseudotime<span class="op">$</span>order_tscan <-<span class="st"> </span><span class="op">-</span>df_pseudotime<span class="op">$</span>order_tscan</a> +<a class="sourceLine" id="cb786-8" data-line-number="8"><span class="co">#df_pseudotime$diffusionmap <- df_pseudotime$diffusionmap</span></a> +<a class="sourceLine" id="cb786-9" data-line-number="9">df_pseudotime<span class="op">$</span>slingshot1 <-<span class="st"> </span><span class="kw">colData</span>(deng_SCE)<span class="op">$</span>slingPseudotime_<span class="dv">1</span></a> +<a class="sourceLine" id="cb786-10" data-line-number="10"></a> +<a class="sourceLine" id="cb786-11" data-line-number="11"><span class="kw">corrplot.mixed</span>(<span class="kw">cor</span>(df_pseudotime, <span class="dt">use =</span> <span class="st">"na.or.complete"</span>), </a> +<a class="sourceLine" id="cb786-12" data-line-number="12"> <span class="dt">order =</span> <span class="st">"hclust"</span>, <span class="dt">tl.col =</span> <span class="st">"black"</span>,</a> +<a class="sourceLine" id="cb786-13" data-line-number="13"> <span class="dt">main =</span> <span class="st">"Correlation matrix for pseudotime results"</span>,</a> +<a class="sourceLine" id="cb786-14" data-line-number="14"> <span class="dt">mar =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">0</span>, <span class="fl">3.1</span>, <span class="dv">0</span>))</a></code></pre></div> <p><img src="pseudotime_files/figure-html/compare-results-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>We see here that Ouija, TSCAN and SLICER all give trajectories that are similar and strongly 
correlated with PC1. Diffusion Map is less strongly correlated with these methods, and Monocle gives very different results.</p> </div> <div id="expression-of-genes-through-time" class="section level3"> -<h3><span class="header-section-number">11.1.9</span> Expression of genes through time</h3> +<h3><span class="header-section-number">11.1.10</span> Expression of genes through time</h3> <p>Each package also enables the visualization of expression through pseudotime. Following individual genes is very helpful for identifying genes that play an important role in the differentiation process. We illustrate the procedure using the <code>Nanog</code> gene.</p> <p>We have added the pseudotime values computed with all methods here to the <code>colData</code> slot of an <code>SCE</code> object. Having done that, the full @@ -1144,77 +1178,77 @@ investigate relationships between gene expression, cell populations and pseudotime. This is particularly useful for the packages such as SLICER that do not provide plotting functions.</p> <p><strong>Principal components</strong></p> -<div class="sourceCode" id="cb704"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb704-1" data-line-number="1">deng_SCE<span class="op">$</span>PC1 <-<span class="st"> </span><span class="kw">reducedDim</span>(deng_SCE,<span class="st">"PCA"</span>)[,<span class="dv">1</span>]</a> -<a class="sourceLine" id="cb704-2" data-line-number="2"><span class="kw">plotExpression</span>(deng_SCE, <span class="st">"Nanog"</span>, <span class="dt">x =</span> <span class="st">"PC1"</span>, </a> -<a class="sourceLine" id="cb704-3" data-line-number="3"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>, <span class="dt">show_violin =</span> <span class="ot">FALSE</span>,</a> -<a class="sourceLine" id="cb704-4" data-line-number="4"> <span class="dt">show_smooth =</span> <span class="ot">TRUE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb787"><pre 
class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb787-1" data-line-number="1">deng_SCE<span class="op">$</span>PC1 <-<span class="st"> </span><span class="kw">reducedDim</span>(deng_SCE,<span class="st">"PCA"</span>)[,<span class="dv">1</span>]</a> +<a class="sourceLine" id="cb787-2" data-line-number="2"><span class="kw">plotExpression</span>(deng_SCE, <span class="st">"Nanog"</span>, <span class="dt">x =</span> <span class="st">"PC1"</span>, </a> +<a class="sourceLine" id="cb787-3" data-line-number="3"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>, <span class="dt">show_violin =</span> <span class="ot">FALSE</span>,</a> +<a class="sourceLine" id="cb787-4" data-line-number="4"> <span class="dt">show_smooth =</span> <span class="ot">TRUE</span>)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/Nanog-pc1-1.png" width="90%" style="display: block; margin: auto;" /></p> <p><strong>TSCAN</strong></p> -<div class="sourceCode" id="cb705"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb705-1" data-line-number="1"><span class="kw">plotExpression</span>(deng_SCE, <span class="st">"Nanog"</span>, <span class="dt">x =</span> <span class="st">"pseudotime_order_tscan"</span>, </a> -<a class="sourceLine" id="cb705-2" data-line-number="2"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>, <span class="dt">show_violin =</span> <span class="ot">FALSE</span>,</a> -<a class="sourceLine" id="cb705-3" data-line-number="3"> <span class="dt">show_smooth =</span> <span class="ot">TRUE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb788"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb788-1" data-line-number="1"><span class="kw">plotExpression</span>(deng_SCE, <span class="st">"Nanog"</span>, <span class="dt">x =</span> <span class="st">"pseudotime_order_tscan"</span>, </a> +<a class="sourceLine" id="cb788-2" 
data-line-number="2"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>, <span class="dt">show_violin =</span> <span class="ot">FALSE</span>,</a> +<a class="sourceLine" id="cb788-3" data-line-number="3"> <span class="dt">show_smooth =</span> <span class="ot">TRUE</span>)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/Nanog-tscan-1.png" width="90%" style="display: block; margin: auto;" /></p> <p><strong>Monocle</strong></p> -<div class="sourceCode" id="cb706"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb706-1" data-line-number="1"><span class="kw">plotExpression</span>(deng_SCE, <span class="st">"Nanog"</span>, <span class="dt">x =</span> <span class="st">"pseudotime_monocle2"</span>, </a> -<a class="sourceLine" id="cb706-2" data-line-number="2"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>, <span class="dt">show_violin =</span> <span class="ot">FALSE</span>,</a> -<a class="sourceLine" id="cb706-3" data-line-number="3"> <span class="dt">show_smooth =</span> <span class="ot">TRUE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb789"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb789-1" data-line-number="1"><span class="kw">plotExpression</span>(deng_SCE, <span class="st">"Nanog"</span>, <span class="dt">x =</span> <span class="st">"pseudotime_monocle2"</span>, </a> +<a class="sourceLine" id="cb789-2" data-line-number="2"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>, <span class="dt">show_violin =</span> <span class="ot">FALSE</span>,</a> +<a class="sourceLine" id="cb789-3" data-line-number="3"> <span class="dt">show_smooth =</span> <span class="ot">TRUE</span>)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/Nanog-monocle-1.png" width="90%" style="display: block; margin: auto;" /></p> <p><strong>Diffusion Map</strong></p> -<div class="sourceCode" id="cb707"><pre 
class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb707-1" data-line-number="1"><span class="kw">plotExpression</span>(deng_SCE, <span class="st">"Nanog"</span>, <span class="dt">x =</span> <span class="st">"pseudotime_diffusionmap"</span>, </a> -<a class="sourceLine" id="cb707-2" data-line-number="2"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>, <span class="dt">show_violin =</span> <span class="ot">FALSE</span>,</a> -<a class="sourceLine" id="cb707-3" data-line-number="3"> <span class="dt">show_smooth =</span> <span class="ot">TRUE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb790"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb790-1" data-line-number="1"><span class="kw">plotExpression</span>(deng_SCE, <span class="st">"Nanog"</span>, <span class="dt">x =</span> <span class="st">"pseudotime_diffusionmap"</span>, </a> +<a class="sourceLine" id="cb790-2" data-line-number="2"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>, <span class="dt">show_violin =</span> <span class="ot">FALSE</span>,</a> +<a class="sourceLine" id="cb790-3" data-line-number="3"> <span class="dt">show_smooth =</span> <span class="ot">TRUE</span>)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/Nanog-diff-map-1.png" width="90%" style="display: block; margin: auto;" /></p> <p><strong>SLICER</strong></p> -<div class="sourceCode" id="cb708"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb708-1" data-line-number="1"><span class="kw">plotExpression</span>(deng_SCE, <span class="st">"Nanog"</span>, <span class="dt">x =</span> <span class="st">"pseudotime_slicer"</span>, </a> -<a class="sourceLine" id="cb708-2" data-line-number="2"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>, <span class="dt">show_violin =</span> <span class="ot">FALSE</span>,</a> -<a class="sourceLine" id="cb708-3" 
data-line-number="3"> <span class="dt">show_smooth =</span> <span class="ot">TRUE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb791"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb791-1" data-line-number="1"><span class="kw">plotExpression</span>(deng_SCE, <span class="st">"Nanog"</span>, <span class="dt">x =</span> <span class="st">"pseudotime_slicer"</span>, </a> +<a class="sourceLine" id="cb791-2" data-line-number="2"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>, <span class="dt">show_violin =</span> <span class="ot">FALSE</span>,</a> +<a class="sourceLine" id="cb791-3" data-line-number="3"> <span class="dt">show_smooth =</span> <span class="ot">TRUE</span>)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/Nanog-slicer-1.png" width="90%" style="display: block; margin: auto;" /></p> <p><strong>Ouija</strong></p> -<div class="sourceCode" id="cb709"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb709-1" data-line-number="1"><span class="kw">plotExpression</span>(deng_SCE, <span class="st">"Nanog"</span>, <span class="dt">x =</span> <span class="st">"pseudotime_ouija"</span>, </a> -<a class="sourceLine" id="cb709-2" data-line-number="2"> <span class="dt">colour_by =</span> <span class="st">"cell_type2"</span>, <span class="dt">show_violin =</span> <span class="ot">FALSE</span>,</a> -<a class="sourceLine" id="cb709-3" data-line-number="3"> <span class="dt">show_smooth =</span> <span class="ot">TRUE</span>)</a></code></pre></div> +<div class="sourceCode" id="cb792"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb792-1" data-line-number="1"><span class="kw">plotExpression</span>(deng_SCE, <span class="st">"Nanog"</span>, <span class="dt">x =</span> <span class="st">"pseudotime_ouija"</span>, </a> +<a class="sourceLine" id="cb792-2" data-line-number="2"> <span class="dt">colour_by =</span> <span 
class="st">"cell_type2"</span>, <span class="dt">show_violin =</span> <span class="ot">FALSE</span>,</a> +<a class="sourceLine" id="cb792-3" data-line-number="3"> <span class="dt">show_smooth =</span> <span class="ot">TRUE</span>)</a></code></pre></div> <p><img src="pseudotime_files/figure-html/Nanog-ouija-1.png" width="90%" style="display: block; margin: auto;" /></p> <p>How many of these methods outperform the naive approach of using the first principal component to represent pseudotime for these data?</p> <p><strong>Exercise 7</strong>: Repeat the exercise using a subset of the genes, e.g. the set of highly variable genes that can be obtained using <code>Brennecke_getVariableGenes()</code></p> </div> <div id="dynverse" class="section level3"> -<h3><span class="header-section-number">11.1.10</span> dynverse</h3> +<h3><span class="header-section-number">11.1.11</span> dynverse</h3> <p><a href="https://dynverse.org/users/2-quick_start/" class="uri">https://dynverse.org/users/2-quick_start/</a></p> -<div class="sourceCode" id="cb710"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb710-1" data-line-number="1"><span class="kw">library</span>(dyno)</a> -<a class="sourceLine" id="cb710-2" data-line-number="2"><span class="kw">library</span>(tidyverse)</a> -<a class="sourceLine" id="cb710-3" data-line-number="3"></a> -<a class="sourceLine" id="cb710-4" data-line-number="4"><span class="co"># Reproduces the guidelines as created in the shiny app</span></a> -<a class="sourceLine" id="cb710-5" data-line-number="5">answers <-<span class="st"> </span>dynguidelines<span class="op">::</span><span class="kw">answer_questions</span>(</a> -<a class="sourceLine" id="cb710-6" data-line-number="6"> <span class="dt">multiple_disconnected =</span> <span class="ot">FALSE</span>, </a> -<a class="sourceLine" id="cb710-7" data-line-number="7"> <span class="dt">expect_topology =</span> <span class="ot">TRUE</span>, </a> -<a class="sourceLine" id="cb710-8" 
data-line-number="8"> <span class="dt">expected_topology =</span> <span class="st">"linear"</span>, </a> -<a class="sourceLine" id="cb710-9" data-line-number="9"> <span class="dt">n_cells =</span> <span class="dv">3000</span>, </a> -<a class="sourceLine" id="cb710-10" data-line-number="10"> <span class="dt">n_features =</span> <span class="dv">10000</span>, </a> -<a class="sourceLine" id="cb710-11" data-line-number="11"> <span class="dt">memory =</span> <span class="st">"100GB"</span>, </a> -<a class="sourceLine" id="cb710-12" data-line-number="12"> <span class="dt">docker =</span> <span class="ot">FALSE</span></a> -<a class="sourceLine" id="cb710-13" data-line-number="13">)</a> -<a class="sourceLine" id="cb710-14" data-line-number="14">guidelines <-<span class="st"> </span>dynguidelines<span class="op">::</span><span class="kw">guidelines</span>(<span class="dt">answers =</span> answers) </a> -<a class="sourceLine" id="cb710-15" data-line-number="15">guidelines</a> -<a class="sourceLine" id="cb710-16" data-line-number="16"></a> -<a class="sourceLine" id="cb710-17" data-line-number="17"></a> -<a class="sourceLine" id="cb710-18" data-line-number="18">deng_dataset <-<span class="st"> </span><span class="kw">wrap_expression</span>(</a> -<a class="sourceLine" id="cb710-19" data-line-number="19"> <span class="dt">counts =</span> <span class="kw">counts</span>(deng_SCE),</a> -<a class="sourceLine" id="cb710-20" data-line-number="20"> <span class="dt">expression =</span> <span class="kw">assay</span>(deng_SCE,<span class="st">"logcounts"</span>)</a> -<a class="sourceLine" id="cb710-21" data-line-number="21"> </a> -<a class="sourceLine" id="cb710-22" data-line-number="22">)</a> -<a class="sourceLine" id="cb710-23" data-line-number="23">model <-<span class="st"> </span><span class="kw">infer_trajectory</span>(deng_dataset, <span class="kw">first</span>(guidelines<span class="op">$</span>methods_selected))</a> -<a class="sourceLine" id="cb710-24" data-line-number="24"><span 
class="co">## Loading required namespace: hdf5r</span></a> -<a class="sourceLine" id="cb710-25" data-line-number="25"></a> -<a class="sourceLine" id="cb710-26" data-line-number="26">model <-<span class="st"> </span>model <span class="op">%>%</span><span class="st"> </span><span class="kw">add_dimred</span>(dyndimred<span class="op">::</span>dimred_mds, </a> -<a class="sourceLine" id="cb710-27" data-line-number="27"> <span class="dt">expression_source =</span> deng_dataset<span class="op">$</span>expression)</a> -<a class="sourceLine" id="cb710-28" data-line-number="28"><span class="kw">plot_dimred</span>(</a> -<a class="sourceLine" id="cb710-29" data-line-number="29"> model, </a> -<a class="sourceLine" id="cb710-30" data-line-number="30"> <span class="dt">expression_source =</span> deng_dataset<span class="op">$</span>expression, </a> -<a class="sourceLine" id="cb710-31" data-line-number="31"> <span class="dt">grouping =</span> deng_SCE<span class="op">$</span>cell_type2</a> -<a class="sourceLine" id="cb710-32" data-line-number="32">)</a></code></pre></div> +<div class="sourceCode" id="cb793"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb793-1" data-line-number="1"><span class="kw">library</span>(dyno)</a> +<a class="sourceLine" id="cb793-2" data-line-number="2"><span class="kw">library</span>(tidyverse)</a> +<a class="sourceLine" id="cb793-3" data-line-number="3"></a> +<a class="sourceLine" id="cb793-4" data-line-number="4"><span class="co"># Reproduces the guidelines as created in the shiny app</span></a> +<a class="sourceLine" id="cb793-5" data-line-number="5">answers <-<span class="st"> </span>dynguidelines<span class="op">::</span><span class="kw">answer_questions</span>(</a> +<a class="sourceLine" id="cb793-6" data-line-number="6"> <span class="dt">multiple_disconnected =</span> <span class="ot">FALSE</span>, </a> +<a class="sourceLine" id="cb793-7" data-line-number="7"> <span class="dt">expect_topology =</span> <span 
class="ot">TRUE</span>, </a> +<a class="sourceLine" id="cb793-8" data-line-number="8"> <span class="dt">expected_topology =</span> <span class="st">"linear"</span>, </a> +<a class="sourceLine" id="cb793-9" data-line-number="9"> <span class="dt">n_cells =</span> <span class="dv">3000</span>, </a> +<a class="sourceLine" id="cb793-10" data-line-number="10"> <span class="dt">n_features =</span> <span class="dv">10000</span>, </a> +<a class="sourceLine" id="cb793-11" data-line-number="11"> <span class="dt">memory =</span> <span class="st">"100GB"</span>, </a> +<a class="sourceLine" id="cb793-12" data-line-number="12"> <span class="dt">docker =</span> <span class="ot">FALSE</span></a> +<a class="sourceLine" id="cb793-13" data-line-number="13">)</a> +<a class="sourceLine" id="cb793-14" data-line-number="14">guidelines <-<span class="st"> </span>dynguidelines<span class="op">::</span><span class="kw">guidelines</span>(<span class="dt">answers =</span> answers) </a> +<a class="sourceLine" id="cb793-15" data-line-number="15">guidelines</a> +<a class="sourceLine" id="cb793-16" data-line-number="16"></a> +<a class="sourceLine" id="cb793-17" data-line-number="17"></a> +<a class="sourceLine" id="cb793-18" data-line-number="18">deng_dataset <-<span class="st"> </span><span class="kw">wrap_expression</span>(</a> +<a class="sourceLine" id="cb793-19" data-line-number="19"> <span class="dt">counts =</span> <span class="kw">counts</span>(deng_SCE),</a> +<a class="sourceLine" id="cb793-20" data-line-number="20"> <span class="dt">expression =</span> <span class="kw">assay</span>(deng_SCE,<span class="st">"logcounts"</span>)</a> +<a class="sourceLine" id="cb793-21" data-line-number="21"> </a> +<a class="sourceLine" id="cb793-22" data-line-number="22">)</a> +<a class="sourceLine" id="cb793-23" data-line-number="23">model <-<span class="st"> </span><span class="kw">infer_trajectory</span>(deng_dataset, <span class="kw">first</span>(guidelines<span class="op">$</span>methods_selected))</a> 
+<a class="sourceLine" id="cb793-24" data-line-number="24"><span class="co">## Loading required namespace: hdf5r</span></a> +<a class="sourceLine" id="cb793-25" data-line-number="25"></a> +<a class="sourceLine" id="cb793-26" data-line-number="26">model <-<span class="st"> </span>model <span class="op">%>%</span><span class="st"> </span><span class="kw">add_dimred</span>(dyndimred<span class="op">::</span>dimred_mds, </a> +<a class="sourceLine" id="cb793-27" data-line-number="27"> <span class="dt">expression_source =</span> deng_dataset<span class="op">$</span>expression)</a> +<a class="sourceLine" id="cb793-28" data-line-number="28"><span class="kw">plot_dimred</span>(</a> +<a class="sourceLine" id="cb793-29" data-line-number="29"> model, </a> +<a class="sourceLine" id="cb793-30" data-line-number="30"> <span class="dt">expression_source =</span> deng_dataset<span class="op">$</span>expression, </a> +<a class="sourceLine" id="cb793-31" data-line-number="31"> <span class="dt">grouping =</span> deng_SCE<span class="op">$</span>cell_type2</a> +<a class="sourceLine" id="cb793-32" data-line-number="32">)</a></code></pre></div> </div> <div id="sessioninfo-7" class="section level3"> -<h3><span class="header-section-number">11.1.11</span> sessionInfo()</h3> +<h3><span class="header-section-number">11.1.12</span> sessionInfo()</h3> <pre><code>## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS @@ -1240,135 +1274,136 @@ SLICER that do not provide plotting functions.</p> ## [3] lle_1.1 snowfall_1.84-6.1 ## [5] snow_0.4-3 MASS_7.3-51.1 ## [7] scatterplot3d_0.3-41 monocle3_0.2.0 -## [9] ouija_0.99.0 Rcpp_1.0.2 -## [11] SLICER_0.2.0 slingshot_1.2.0 -## [13] princurve_2.1.4 Polychrome_1.2.3 -## [15] corrplot_0.84 ggbeeswarm_0.6.0 -## [17] ggthemes_4.2.0 scater_1.12.2 -## [19] destiny_2.14.0 monocle_2.12.0 -## [21] DDRTree_0.1.5 irlba_2.3.3 -## [23] VGAM_1.1-1 ggplot2_3.2.1 -## [25] Matrix_1.2-17 M3Drop_1.10.0 -## [27] 
numDeriv_2016.8-1.1 TSCAN_1.22.0 -## [29] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 -## [31] DelayedArray_0.10.0 BiocParallel_1.18.1 -## [33] matrixStats_0.55.0 Biobase_2.44.0 -## [35] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 -## [37] IRanges_2.18.3 S4Vectors_0.22.1 -## [39] BiocGenerics_0.30.0 +## [9] gam_1.16.1 foreach_1.4.7 +## [11] ouija_0.99.0 Rcpp_1.0.2 +## [13] SLICER_0.2.0 slingshot_1.2.0 +## [15] princurve_2.1.4 Polychrome_1.2.3 +## [17] corrplot_0.84 ggbeeswarm_0.6.0 +## [19] ggthemes_4.2.0 scater_1.12.2 +## [21] destiny_2.14.0 monocle_2.12.0 +## [23] DDRTree_0.1.5 irlba_2.3.3 +## [25] VGAM_1.1-1 ggplot2_3.2.1 +## [27] Matrix_1.2-17 M3Drop_1.10.0 +## [29] numDeriv_2016.8-1.1 TSCAN_1.22.0 +## [31] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 +## [33] DelayedArray_0.10.0 BiocParallel_1.18.1 +## [35] matrixStats_0.55.0 Biobase_2.44.0 +## [37] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 +## [39] IRanges_2.18.3 S4Vectors_0.22.1 +## [41] BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] rgl_0.100.30 rsvd_1.0.2 ## [3] vcd_1.4-4 Hmisc_4.2-0 ## [5] zinbwave_1.6.0 corpcor_1.6.9 ## [7] ps_1.3.0 class_7.3-15 -## [9] foreach_1.4.7 lmtest_0.9-37 -## [11] glmnet_2.0-18 crayon_1.3.4 -## [13] laeken_0.5.0 nlme_3.1-139 -## [15] backports_1.1.4 qlcMatrix_0.9.7 -## [17] rlang_0.4.0 XVector_0.24.0 -## [19] readxl_1.3.1 callr_3.3.2 -## [21] limma_3.40.6 phylobase_0.8.6 -## [23] smoother_1.1 manipulateWidget_0.10.0 -## [25] bit64_0.9-7 loo_2.1.0 -## [27] glue_1.3.1 pheatmap_1.0.12 -## [29] rngtools_1.4 splancs_2.01-40 -## [31] processx_3.4.1 vipor_0.4.5 -## [33] AnnotationDbi_1.46.1 haven_2.1.1 -## [35] tidyselect_0.2.5 rio_0.5.16 -## [37] XML_3.98-1.20 tidyr_1.0.0 -## [39] zoo_1.8-6 xtable_1.8-4 -## [41] magrittr_1.5 evaluate_0.14 -## [43] bibtex_0.4.2 cli_1.1.0 -## [45] zlibbioc_1.30.0 rstudioapi_0.10 -## [47] miniUI_0.1.1.1 sp_1.3-1 -## [49] rpart_4.1-15 locfdr_1.1-8 -## [51] RcppEigen_0.3.3.5.0 shiny_1.3.2 -## [53] 
BiocSingular_1.0.0 xfun_0.9 -## [55] leidenbase_0.1.0 inline_0.3.15 -## [57] pkgbuild_1.0.5 cluster_2.1.0 -## [59] caTools_1.17.1.2 sgeostat_1.0-27 -## [61] tibble_2.1.3 ggrepel_0.8.1 -## [63] ape_5.3 stabledist_0.7-1 -## [65] zeallot_0.1.0 withr_2.1.2 -## [67] bitops_1.0-6 slam_0.1-45 -## [69] ranger_0.11.2 plyr_1.8.4 -## [71] cellranger_1.1.0 pcaPP_1.9-73 -## [73] sparsesvd_0.2 coda_0.19-3 -## [75] e1071_1.7-2 RcppParallel_4.4.3 -## [77] pillar_1.4.2 gplots_3.0.1.1 -## [79] reldist_1.6-6 kernlab_0.9-27 -## [81] TTR_0.23-5 ellipsis_0.3.0 -## [83] tripack_1.3-8 DelayedMatrixStats_1.6.1 -## [85] xts_0.11-2 vctrs_0.2.0 -## [87] NMF_0.21.0 tools_3.6.0 -## [89] foreign_0.8-70 rncl_0.8.3 -## [91] beeswarm_0.2.3 munsell_0.5.0 -## [93] proxy_0.4-23 HSMMSingleCell_1.4.0 -## [95] compiler_3.6.0 abind_1.4-5 -## [97] httpuv_1.5.2 pkgmaker_0.27 -## [99] GenomeInfoDbData_1.2.1 gridExtra_2.3 -## [101] edgeR_3.26.8 lattice_0.20-38 -## [103] deldir_0.1-23 utf8_1.1.4 -## [105] later_0.8.0 dplyr_0.8.3 -## [107] jsonlite_1.6 scales_1.0.0 -## [109] docopt_0.6.1 carData_3.0-2 -## [111] genefilter_1.66.0 lazyeval_0.2.2 -## [113] promises_1.0.1 spatstat_1.61-0 -## [115] car_3.0-3 doParallel_1.0.15 -## [117] latticeExtra_0.6-28 R.utils_2.9.0 -## [119] goftest_1.1-1 spatstat.utils_1.13-0 -## [121] checkmate_1.9.4 cowplot_1.0.0 -## [123] rmarkdown_1.15 openxlsx_4.1.0.1 -## [125] statmod_1.4.32 webshot_0.5.1 -## [127] Rtsne_0.15 forcats_0.4.0 -## [129] copula_0.999-19.1 softImpute_1.4 -## [131] uwot_0.1.4 igraph_1.2.4.1 -## [133] HDF5Array_1.12.2 survival_2.43-3 -## [135] yaml_2.2.0 htmltools_0.3.6 -## [137] memoise_1.1.0 locfit_1.5-9.1 -## [139] viridisLite_0.3.0 digest_0.6.21 -## [141] assertthat_0.2.1 mime_0.7 -## [143] densityClust_0.3 registry_0.5-1 -## [145] RSQLite_2.1.2 data.table_1.12.2 -## [147] blob_1.2.0 R.oo_1.22.0 -## [149] RNeXML_2.3.0 labeling_0.3 -## [151] fastICA_1.2-2 Formula_1.2-3 -## [153] Rhdf5lib_1.6.1 RCurl_1.95-4.12 -## [155] hms_0.5.1 rhdf5_2.28.0 -## [157] 
colorspace_1.4-1 base64enc_0.1-3 -## [159] nnet_7.3-12 ADGofTest_0.3 -## [161] mclust_5.4.5 bookdown_0.13 -## [163] RANN_2.6.1 mvtnorm_1.0-11 -## [165] fansi_0.4.0 pspline_1.0-18 -## [167] VIM_4.8.0 R6_2.4.0 -## [169] grid_3.6.0 lifecycle_0.1.0 -## [171] acepack_1.4.1 zip_2.0.4 -## [173] curl_4.2 gdata_2.18.0 -## [175] robustbase_0.93-5 howmany_0.3-1 -## [177] RcppAnnoy_0.0.13 RColorBrewer_1.1-2 -## [179] MCMCglmm_2.29 iterators_1.0.12 -## [181] alphahull_2.2 stringr_1.4.0 -## [183] htmlwidgets_1.3 polyclip_1.10-0 -## [185] purrr_0.3.2 crosstalk_1.0.0 -## [187] mgcv_1.8-28 tensorA_0.36.1 -## [189] htmlTable_1.13.2 clusterExperiment_2.4.4 -## [191] codetools_0.2-16 FNN_1.1.3 -## [193] gtools_3.8.1 prettyunits_1.0.2 -## [195] gridBase_0.4-7 RSpectra_0.15-0 -## [197] R.methodsS3_1.7.1 gtable_0.3.0 -## [199] DBI_1.0.0 highr_0.8 -## [201] tensor_1.5 httr_1.4.1 -## [203] KernSmooth_2.23-15 stringi_1.4.3 -## [205] progress_1.2.2 reshape2_1.4.3 -## [207] uuid_0.1-2 cubature_2.0.3 -## [209] annotate_1.62.0 viridis_0.5.1 -## [211] xml2_1.2.2 combinat_0.0-8 -## [213] bbmle_1.0.20 boot_1.3-20 -## [215] BiocNeighbors_1.2.0 ade4_1.7-13 -## [217] DEoptimR_1.0-8 bit_1.1-14 -## [219] spatstat.data_1.4-0 pkgconfig_2.0.3 -## [221] gsl_2.1-6 knitr_1.25</code></pre> +## [9] lmtest_0.9-37 glmnet_2.0-18 +## [11] crayon_1.3.4 laeken_0.5.0 +## [13] nlme_3.1-139 backports_1.1.4 +## [15] qlcMatrix_0.9.7 rlang_0.4.0 +## [17] XVector_0.24.0 readxl_1.3.1 +## [19] callr_3.3.2 limma_3.40.6 +## [21] phylobase_0.8.6 smoother_1.1 +## [23] manipulateWidget_0.10.0 bit64_0.9-7 +## [25] loo_2.1.0 glue_1.3.1 +## [27] pheatmap_1.0.12 rngtools_1.4 +## [29] splancs_2.01-40 processx_3.4.1 +## [31] vipor_0.4.5 AnnotationDbi_1.46.1 +## [33] haven_2.1.1 tidyselect_0.2.5 +## [35] rio_0.5.16 XML_3.98-1.20 +## [37] tidyr_1.0.0 zoo_1.8-6 +## [39] xtable_1.8-4 magrittr_1.5 +## [41] evaluate_0.14 bibtex_0.4.2 +## [43] cli_1.1.0 zlibbioc_1.30.0 +## [45] rstudioapi_0.10 miniUI_0.1.1.1 +## [47] sp_1.3-1 rpart_4.1-15 +## 
[49] locfdr_1.1-8 RcppEigen_0.3.3.5.0 +## [51] shiny_1.3.2 BiocSingular_1.0.0 +## [53] xfun_0.9 leidenbase_0.1.0 +## [55] inline_0.3.15 pkgbuild_1.0.5 +## [57] cluster_2.1.0 caTools_1.17.1.2 +## [59] sgeostat_1.0-27 tibble_2.1.3 +## [61] ggrepel_0.8.1 ape_5.3 +## [63] stabledist_0.7-1 zeallot_0.1.0 +## [65] withr_2.1.2 bitops_1.0-6 +## [67] slam_0.1-45 ranger_0.11.2 +## [69] plyr_1.8.4 cellranger_1.1.0 +## [71] pcaPP_1.9-73 sparsesvd_0.2 +## [73] coda_0.19-3 e1071_1.7-2 +## [75] RcppParallel_4.4.3 pillar_1.4.2 +## [77] gplots_3.0.1.1 reldist_1.6-6 +## [79] kernlab_0.9-27 TTR_0.23-5 +## [81] ellipsis_0.3.0 tripack_1.3-8 +## [83] DelayedMatrixStats_1.6.1 xts_0.11-2 +## [85] vctrs_0.2.0 NMF_0.21.0 +## [87] tools_3.6.0 foreign_0.8-70 +## [89] rncl_0.8.3 beeswarm_0.2.3 +## [91] munsell_0.5.0 proxy_0.4-23 +## [93] HSMMSingleCell_1.4.0 compiler_3.6.0 +## [95] abind_1.4-5 httpuv_1.5.2 +## [97] pkgmaker_0.27 GenomeInfoDbData_1.2.1 +## [99] gridExtra_2.3 edgeR_3.26.8 +## [101] lattice_0.20-38 deldir_0.1-23 +## [103] utf8_1.1.4 later_0.8.0 +## [105] dplyr_0.8.3 jsonlite_1.6 +## [107] scales_1.0.0 docopt_0.6.1 +## [109] carData_3.0-2 genefilter_1.66.0 +## [111] lazyeval_0.2.2 promises_1.0.1 +## [113] spatstat_1.61-0 car_3.0-3 +## [115] doParallel_1.0.15 latticeExtra_0.6-28 +## [117] R.utils_2.9.0 goftest_1.1-1 +## [119] spatstat.utils_1.13-0 checkmate_1.9.4 +## [121] cowplot_1.0.0 rmarkdown_1.15 +## [123] openxlsx_4.1.0.1 statmod_1.4.32 +## [125] webshot_0.5.1 Rtsne_0.15 +## [127] forcats_0.4.0 copula_0.999-19.1 +## [129] softImpute_1.4 uwot_0.1.4 +## [131] igraph_1.2.4.1 HDF5Array_1.12.2 +## [133] survival_2.43-3 yaml_2.2.0 +## [135] htmltools_0.3.6 memoise_1.1.0 +## [137] locfit_1.5-9.1 viridisLite_0.3.0 +## [139] digest_0.6.21 assertthat_0.2.1 +## [141] mime_0.7 densityClust_0.3 +## [143] registry_0.5-1 RSQLite_2.1.2 +## [145] data.table_1.12.2 blob_1.2.0 +## [147] R.oo_1.22.0 RNeXML_2.3.0 +## [149] labeling_0.3 fastICA_1.2-2 +## [151] Formula_1.2-3 Rhdf5lib_1.6.1 +## [153] 
RCurl_1.95-4.12 hms_0.5.1 +## [155] rhdf5_2.28.0 colorspace_1.4-1 +## [157] base64enc_0.1-3 nnet_7.3-12 +## [159] ADGofTest_0.3 mclust_5.4.5 +## [161] bookdown_0.13 RANN_2.6.1 +## [163] mvtnorm_1.0-11 fansi_0.4.0 +## [165] pspline_1.0-18 VIM_4.8.0 +## [167] R6_2.4.0 grid_3.6.0 +## [169] lifecycle_0.1.0 acepack_1.4.1 +## [171] zip_2.0.4 curl_4.2 +## [173] gdata_2.18.0 robustbase_0.93-5 +## [175] howmany_0.3-1 RcppAnnoy_0.0.13 +## [177] RColorBrewer_1.1-2 MCMCglmm_2.29 +## [179] iterators_1.0.12 alphahull_2.2 +## [181] stringr_1.4.0 htmlwidgets_1.3 +## [183] polyclip_1.10-0 purrr_0.3.2 +## [185] crosstalk_1.0.0 mgcv_1.8-28 +## [187] tensorA_0.36.1 htmlTable_1.13.2 +## [189] clusterExperiment_2.4.4 codetools_0.2-16 +## [191] FNN_1.1.3 gtools_3.8.1 +## [193] prettyunits_1.0.2 gridBase_0.4-7 +## [195] RSpectra_0.15-0 R.methodsS3_1.7.1 +## [197] gtable_0.3.0 DBI_1.0.0 +## [199] highr_0.8 tensor_1.5 +## [201] httr_1.4.1 KernSmooth_2.23-15 +## [203] stringi_1.4.3 progress_1.2.2 +## [205] reshape2_1.4.3 uuid_0.1-2 +## [207] cubature_2.0.3 annotate_1.62.0 +## [209] viridis_0.5.1 xml2_1.2.2 +## [211] combinat_0.0-8 bbmle_1.0.20 +## [213] boot_1.3-20 BiocNeighbors_1.2.0 +## [215] ade4_1.7-13 DEoptimR_1.0-8 +## [217] bit_1.1-14 spatstat.data_1.4-0 +## [219] pkgconfig_2.0.3 gsl_2.1-6 +## [221] knitr_1.25</code></pre> </div> </div>