diff --git a/public/clustering-and-cell-annotation.html b/public/clustering-and-cell-annotation.html
index a863e55364b9b1b2fee5ada94d5a46754cb2159e..ddbea5025c7a07bdb80e8e31177402d1479af1fe 100644
--- a/public/clustering-and-cell-annotation.html
+++ b/public/clustering-and-cell-annotation.html
@@ -878,7 +878,7 @@ parts, while if <span class="math inline">\(K\)</span> is too large, clusters mi
 <p><img src="clustering_files/figure-html/unnamed-chunk-10-1.png" width="672" style="display: block; margin: auto;" /></p>
 <p>Compare the results of <code>SC3</code> clustering with the original publication cell type labels:</p>
 <div class="sourceCode" id="cb470"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb470-1" data-line-number="1"><span class="kw">adjustedRandIndex</span>(<span class="kw">colData</span>(deng)<span class="op">$</span>cell_type2, <span class="kw">colData</span>(deng)<span class="op">$</span>sc3_<span class="dv">10</span>_clusters)</a></code></pre></div>
-<pre><code>## [1] 0.6581962</code></pre>
+<pre><code>## [1] 0.7804189</code></pre>
 <p><strong>Note</strong> <code>SC3</code> can also be run in an interactive <code>Shiny</code> session:</p>
 <div class="sourceCode" id="cb472"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb472-1" data-line-number="1"><span class="kw">sc3_interactive</span>(deng)</a></code></pre></div>
 <p>This command will open <code>SC3</code> in a web browser.</p>
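For reference, `adjustedRandIndex` (from the `mclust` package, as used in the comparison above) returns 1 for identical partitions and values near 0 for unrelated labelings, so the updated value of 0.78 indicates good but imperfect agreement with the published labels. A minimal sketch of its behaviour on toy labels (hypothetical values, not from the `deng` data):

```r
library(mclust)
truth  <- c("A", "A", "B", "B", "C", "C")
same   <- c("x", "x", "y", "y", "z", "z")   # same partition, relabelled: ARI = 1
random <- c("x", "y", "x", "z", "y", "z")   # unrelated labels: ARI near 0
adjustedRandIndex(truth, same)
adjustedRandIndex(truth, random)
```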
@@ -989,11 +989,11 @@ This step has two outputs:</p>
 <pre><code>## [1]   50 2126</code></pre>
 <div class="sourceCode" id="cb485"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb485-1" data-line-number="1"><span class="kw">metadata</span>(sceM)<span class="op">$</span>scmap_cell_index<span class="op">$</span>subclusters[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>,<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>]</a></code></pre></div>
 <pre><code>##      D28.1_1 D28.1_13 D28.1_15 D28.1_17 D28.1_2
-## [1,]      13       25       13        6      38
-## [2,]      23       46       36       25      11
-## [3,]      38       44       20       16      43
-## [4,]      18       36       32        4      33
-## [5,]       2       44       21       21      10</code></pre>
+## [1,]       8       14        5        3       3
+## [2,]      15       40        8       35       4
+## [3,]       7       43       21        1      20
+## [4,]      27       42       34       27      30
+## [5,]      45       38        4       41      36</code></pre>
 <p><strong>Projection:</strong> Once the scmap-cell indexes have been generated we can use them to project the test dataset.</p>
 <div class="sourceCode" id="cb487"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb487-1" data-line-number="1">scmapCell_results &lt;-<span class="st"> </span><span class="kw">scmapCell</span>(</a>
 <a class="sourceLine" id="cb487-2" data-line-number="2">  <span class="dt">projection =</span> segerstolpe,</a>
diff --git a/public/clustering.md b/public/clustering.md
index 7f8e583804817b0b9cc8278ae4b312057790e554..73c8417b36921ffa08477f46a0c9370fa41f0a4c 100644
--- a/public/clustering.md
+++ b/public/clustering.md
@@ -241,7 +241,7 @@ adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$sc3_10_clusters)
 ```
 
 ```
-## [1] 0.6581962
+## [1] 0.7804189
 ```
 
 __Note__ `SC3` can also be run in an interactive `Shiny` session:
@@ -393,11 +393,11 @@ metadata(sceM)$scmap_cell_index$subclusters[1:5,1:5]
 
 ```
 ##      D28.1_1 D28.1_13 D28.1_15 D28.1_17 D28.1_2
-## [1,]      13       25       13        6      38
-## [2,]      23       46       36       25      11
-## [3,]      38       44       20       16      43
-## [4,]      18       36       32        4      33
-## [5,]       2       44       21       21      10
+## [1,]       8       14        5        3       3
+## [2,]      15       40        8       35       4
+## [3,]       7       43       21        1      20
+## [4,]      27       42       34       27      30
+## [5,]      45       38        4       41      36
 ```
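For context, the `subclusters` matrix shown above is part of the product-quantiser index that `scmap` builds with `indexCell`: the feature space is split into `M` chunks and cells are k-means clustered within each chunk. A sketch of the indexing call (the parameter values below are illustrative, mirroring the dimensions seen above; both default to data-driven values when omitted):

```r
## Build the scmap-cell index: M feature chunks, k clusters per chunk.
sceM <- indexCell(sceM, M = 50, k = floor(sqrt(ncol(sceM))))
names(metadata(sceM)$scmap_cell_index)
```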
 
 
diff --git a/public/clustering_files/figure-html/unnamed-chunk-10-1.png b/public/clustering_files/figure-html/unnamed-chunk-10-1.png
index cfde036e964ed0fd42762ab54d9c02501bff0fe0..606bc623f4b993cf76b4386c3da3366e8e300f09 100644
Binary files a/public/clustering_files/figure-html/unnamed-chunk-10-1.png and b/public/clustering_files/figure-html/unnamed-chunk-10-1.png differ
diff --git a/public/clustering_files/figure-html/unnamed-chunk-2-1.png b/public/clustering_files/figure-html/unnamed-chunk-2-1.png
index b870e4b7456197538e034e38dbea4f2cc18607fb..477c38944fe10b915adcdae98634a1b1fbe28716 100644
Binary files a/public/clustering_files/figure-html/unnamed-chunk-2-1.png and b/public/clustering_files/figure-html/unnamed-chunk-2-1.png differ
diff --git a/public/clustering_files/figure-html/unnamed-chunk-7-1.png b/public/clustering_files/figure-html/unnamed-chunk-7-1.png
index 1b4c7bebaf03c5599a9a5ce02f8446fdb03879d8..2f121cd13fede6c40c040f407f40827adc1a5136 100644
Binary files a/public/clustering_files/figure-html/unnamed-chunk-7-1.png and b/public/clustering_files/figure-html/unnamed-chunk-7-1.png differ
diff --git a/public/clustering_files/figure-html/unnamed-chunk-8-1.png b/public/clustering_files/figure-html/unnamed-chunk-8-1.png
index 698f63632fa0d2869c646aea7393d8656bdcbafb..8c3af151b21fed80e03f96dbfd5a10da049305fe 100644
Binary files a/public/clustering_files/figure-html/unnamed-chunk-8-1.png and b/public/clustering_files/figure-html/unnamed-chunk-8-1.png differ
diff --git a/public/clustering_files/figure-html/unnamed-chunk-9-1.png b/public/clustering_files/figure-html/unnamed-chunk-9-1.png
index 00b7ff7613cb94f507d9bd70d64dc9ba328e3c3a..090fe91aa6439d3f81b80839610d708896bead9d 100644
Binary files a/public/clustering_files/figure-html/unnamed-chunk-9-1.png and b/public/clustering_files/figure-html/unnamed-chunk-9-1.png differ
diff --git a/public/comparing-and-combining-scrna-seq-datasets.html b/public/comparing-and-combining-scrna-seq-datasets.html
index c18f87e8dfe631a693cef34833fbd1b5e9bec48c..6098841751713b7a2fe2310c4d1ee6f15a9852aa 100644
--- a/public/comparing-and-combining-scrna-seq-datasets.html
+++ b/public/comparing-and-combining-scrna-seq-datasets.html
@@ -754,12 +754,12 @@ annotations should be considered synonymous.</p>
 diagram</a>:</p>
 <div class="sourceCode" id="cb661"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb661-1" data-line-number="1"><span class="kw">plot</span>(<span class="kw">getSankey</span>(<span class="kw">colData</span>(muraro)<span class="op">$</span>cell_type1,  muraro_to_seger<span class="op">$</span>scmap_cluster_labs[,<span class="dv">1</span>], <span class="dt">plot_height=</span><span class="dv">400</span>))</a></code></pre></div>
 <!-- Sankey generated in R 3.6.0 by googleVis 0.6.4 package -->
-<!-- Thu Oct  3 08:23:58 2019 -->
+<!-- Thu Oct  3 10:17:31 2019 -->
 <!-- jsHeader -->
 <script type="text/javascript">
  
 // jsData 
-function gvisDataSankeyID626a3560a3f1 () {
+function gvisDataSankeyID8acc41e47c55 () {
 var data = new google.visualization.DataTable();
 var datajson =
 [
@@ -957,8 +957,8 @@ return(data);
 }
  
 // jsDrawChart
-function drawChartSankeyID626a3560a3f1() {
-var data = gvisDataSankeyID626a3560a3f1();
+function drawChartSankeyID8acc41e47c55() {
+var data = gvisDataSankeyID8acc41e47c55();
 var options = {};
 options["width"] = 400;
 options["height"] = 400;
@@ -977,7 +977,7 @@ options["sankey"] = {
             };
 
     var chart = new google.visualization.Sankey(
-    document.getElementById('SankeyID626a3560a3f1')
+    document.getElementById('SankeyID8acc41e47c55')
     );
     chart.draw(data,options);
     
@@ -1001,9 +1001,9 @@ if (newPackage)
   pkgs.push(chartid);
   
 // Add the drawChart function to the global list of callbacks
-callbacks.push(drawChartSankeyID626a3560a3f1);
+callbacks.push(drawChartSankeyID8acc41e47c55);
 })();
-function displayChartSankeyID626a3560a3f1() {
+function displayChartSankeyID8acc41e47c55() {
   var pkgs = window.__gvisPackages = window.__gvisPackages || [];
   var callbacks = window.__gvisCallbacks = window.__gvisCallbacks || [];
   window.clearTimeout(window.__gvisLoad);
@@ -1026,9 +1026,9 @@ callbacks.shift()();
 // jsFooter
 </script>
 <!-- jsChart -->
-<script type="text/javascript" src="https://www.google.com/jsapi?callback=displayChartSankeyID626a3560a3f1"></script>
+<script type="text/javascript" src="https://www.google.com/jsapi?callback=displayChartSankeyID8acc41e47c55"></script>
 <!-- divChart -->
-<div id="SankeyID626a3560a3f1" style="width: 400; height: 400;">
+<div id="SankeyID8acc41e47c55" style="width: 400; height: 400;">
 
 </div>
 <p><strong>Exercise</strong> How many of the previously unclassified cells would we be able to

@@ -1185,7 +1185,7 @@ pair of datasets at a time with CCA.</p>
 <p>Recently, the Seurat team proposed a new method to integrate single-cell
 datasets using “anchors” <span class="citation">(<span class="citeproc-not-found" data-reference-id="Stuart2019-zx"><strong>???</strong></span>)</span>.</p>
 <div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-35"></span>
-<img src="figures/stuart_graphical_abstract.jpg" alt="Seurat integration with anchors." width="90%" />
+<img src="figures/stuart_graphical_abstract.png" alt="Seurat integration with anchors." width="90%" />
 <p class="caption">
 Figure 14.2: Seurat integration with anchors.
 </p>
diff --git a/public/exprs-qc.md b/public/exprs-qc.md
index dbeb0293b43bae6c9d2354ef35e8ee062dbd5d76..535fe076907e8537dd17721da7e449710bc80806 100644
--- a/public/exprs-qc.md
+++ b/public/exprs-qc.md
@@ -465,39 +465,41 @@ umi = bcds(umi)
 ```
 
 ```
-## [1]	train-error:0.058012+0.009028	test-error:0.082759+0.011116 
+## [1]	train-error:0.066402+0.005931	test-error:0.103638+0.025177 
 ## Multiple eval metrics are present. Will use test_error for early stopping.
 ## Will train until test_error hasn't improved in 2 rounds.
 ## 
-## [2]	train-error:0.043255+0.006356	test-error:0.076937+0.011995 
-## [3]	train-error:0.033853+0.003265	test-error:0.064193+0.016507 
-## [4]	train-error:0.029800+0.006149	test-error:0.059020+0.009223 
-## [5]	train-error:0.024594+0.002186	test-error:0.053783+0.016495 
-## [6]	train-error:0.021410+0.002716	test-error:0.051489+0.011570 
-## [7]	train-error:0.017650+0.001547	test-error:0.052659+0.008845 
-## [8]	train-error:0.016202+0.002439	test-error:0.043973+0.006895 
-## [9]	train-error:0.014032+0.002261	test-error:0.046287+0.012793 
-## [10]	train-error:0.011573+0.001440	test-error:0.042244+0.010619 
-## [11]	train-error:0.009404+0.001444	test-error:0.039930+0.010749 
-## [12]	train-error:0.009837+0.001076	test-error:0.039930+0.010904 
-## [13]	train-error:0.008535+0.001904	test-error:0.038771+0.010294 
-## [14]	train-error:0.007523+0.001079	test-error:0.038775+0.010791 
-## [15]	train-error:0.006655+0.001675	test-error:0.036457+0.008707 
-## [16]	train-error:0.005497+0.000577	test-error:0.037616+0.006611 
-## [17]	train-error:0.004919+0.000542	test-error:0.037032+0.006923 
+## [2]	train-error:0.049912+0.002718	test-error:0.085687+0.015561 
+## [3]	train-error:0.036747+0.002381	test-error:0.079870+0.015979 
+## [4]	train-error:0.029948+0.002910	test-error:0.071214+0.015731 
+## [5]	train-error:0.026042+0.003332	test-error:0.072934+0.012966 
+## [6]	train-error:0.022424+0.002773	test-error:0.064263+0.011326 
+## [7]	train-error:0.019385+0.003075	test-error:0.058461+0.012930 
+## [8]	train-error:0.018084+0.001510	test-error:0.056722+0.009848 
+## [9]	train-error:0.016203+0.002440	test-error:0.052094+0.012606 
+## [10]	train-error:0.013310+0.003702	test-error:0.049775+0.013542 
+## [11]	train-error:0.011574+0.002856	test-error:0.046891+0.012536 
+## [12]	train-error:0.010705+0.001959	test-error:0.046302+0.011468 
+## [13]	train-error:0.009693+0.001081	test-error:0.043413+0.012598 
+## [14]	train-error:0.008826+0.000545	test-error:0.042254+0.011269 
+## [15]	train-error:0.007813+0.000709	test-error:0.040515+0.011465 
+## [16]	train-error:0.006655+0.000705	test-error:0.042249+0.011692 
+## [17]	train-error:0.005498+0.000584	test-error:0.042254+0.012804 
 ## Stopping. Best iteration:
-## [15]	train-error:0.006655+0.001675	test-error:0.036457+0.008707
+## [15]	train-error:0.007813+0.000709	test-error:0.040515+0.011465
 ## 
-## [1]	train-error:0.054977 
+## [1]	train-error:0.068866 
 ## Will train until train_error hasn't improved in 2 rounds.
 ## 
-## [2]	train-error:0.039931 
-## [3]	train-error:0.028356 
-## [4]	train-error:0.029514 
-## [5]	train-error:0.023148 
-## [6]	train-error:0.018519 
-## [7]	train-error:0.018519 
-## [8]	train-error:0.015625
+## [2]	train-error:0.038194 
+## [3]	train-error:0.034144 
+## [4]	train-error:0.026620 
+## [5]	train-error:0.024884 
+## [6]	train-error:0.020255 
+## [7]	train-error:0.020255 
+## [8]	train-error:0.019097 
+## [9]	train-error:0.016782 
+## [10]	train-error:0.013310
 ```
 
 ```r
@@ -511,12 +513,12 @@ head(cbind(CD$cxds_score,CD$bcds_score, CD$hybrid_score))
 
 ```
 ##                    [,1]        [,2]      [,3]
-## NA19098.r1.A01 4131.405 0.020812672 0.2531013
-## NA19098.r1.A02 4564.089 0.010029603 0.2674993
-## NA19098.r1.A03 2827.904 0.005551378 0.1611125
-## NA19098.r1.A04 4708.213 0.005341432 0.2711787
-## NA19098.r1.A05 6134.590 0.005762757 0.3552646
-## NA19098.r1.A06 5810.730 0.010443962 0.3410366
+## NA19098.r1.A01 4131.405 0.012810320 0.2455279
+## NA19098.r1.A02 4564.089 0.004930826 0.2628488
+## NA19098.r1.A03 2827.904 0.020504847 0.1769445
+## NA19098.r1.A04 4708.213 0.009793874 0.2762736
+## NA19098.r1.A05 6134.590 0.006487557 0.3565501
+## NA19098.r1.A06 5810.730 0.008279418 0.3393878
 ```
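The `hybrid_score` has no absolute cutoff, so a pragmatic follow-up is to rank cells and flag the expected doublet fraction. A sketch (the 2% rate is a hypothetical expectation, not a value estimated from this dataset):

```r
## Flag the top-scoring cells as likely doublets, assuming ~2% doublets.
expected_doublets <- ceiling(0.02 * ncol(umi))
doublet_flag <- rank(-CD$hybrid_score) <= expected_doublets
table(doublet_flag)
```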
 
 ```r
diff --git a/public/exprs-qc_files/figure-html/unnamed-chunk-15-1.png b/public/exprs-qc_files/figure-html/unnamed-chunk-15-1.png
index d7362d28867f0d414acebd1cdb4d8581b522213f..52728bc0d4f554d522f75c14a6a69716e29978c9 100644
Binary files a/public/exprs-qc_files/figure-html/unnamed-chunk-15-1.png and b/public/exprs-qc_files/figure-html/unnamed-chunk-15-1.png differ
diff --git a/public/exprs-qc_files/figure-html/unnamed-chunk-16-1.png b/public/exprs-qc_files/figure-html/unnamed-chunk-16-1.png
index 6973f5f8e28a563528019385d2055f8ec58c1c19..32d32f6ec55e70b9fcefb13e5f43701354dd6aec 100644
Binary files a/public/exprs-qc_files/figure-html/unnamed-chunk-16-1.png and b/public/exprs-qc_files/figure-html/unnamed-chunk-16-1.png differ
diff --git a/public/figures/data-integration-fig1.png b/public/figures/data-integration-fig1.png
new file mode 100644
index 0000000000000000000000000000000000000000..ba09dbe7d3cac712afb9f5f2eb40bf5c2db062ab
Binary files /dev/null and b/public/figures/data-integration-fig1.png differ
diff --git a/public/figures/data-integration-tab1.png b/public/figures/data-integration-tab1.png
new file mode 100644
index 0000000000000000000000000000000000000000..bd2e429ce2ff15c29092caec25053081a80c3cef
Binary files /dev/null and b/public/figures/data-integration-tab1.png differ
diff --git a/public/figures/stuart_graphical_abstract.png b/public/figures/stuart_graphical_abstract.png
new file mode 100644
index 0000000000000000000000000000000000000000..97d73a0e41dfad3a1b996f631e474eae8e87003e
Binary files /dev/null and b/public/figures/stuart_graphical_abstract.png differ
diff --git a/public/intro-to-R.md b/public/intro-to-R.md
index 2f88c22e265e533ee24b3a87cd2df40ad8d5ec0e..0a0962b531420fea8734b55adeec52a7b5295709 100644
--- a/public/intro-to-R.md
+++ b/public/intro-to-R.md
@@ -562,7 +562,7 @@ ll
 ## $even_a_function
 ## function (..., deparse.level = 1) 
 ## .Internal(cbind(deparse.level, ...))
-## <bytecode: 0x55dd34ac6108>
+## <bytecode: 0x55ef7883b0f8>
 ## <environment: namespace:base>
 ```
 
diff --git a/public/introduction-to-rbioconductor.html b/public/introduction-to-rbioconductor.html
index abb3eed994c29ce5883f520c6d714cc56003920c..68c11181585d88291fb2961a93710dd175f226db 100644
--- a/public/introduction-to-rbioconductor.html
+++ b/public/introduction-to-rbioconductor.html
@@ -757,7 +757,7 @@ If we combine a character vector and a numeric vector into a matrix, all the dat
 ## $even_a_function
 ## function (..., deparse.level = 1) 
 ## .Internal(cbind(deparse.level, ...))
-## &lt;bytecode: 0x55dd34ac6108&gt;
+## &lt;bytecode: 0x55ef7883b0f8&gt;
 ## &lt;environment: namespace:base&gt;</code></pre>
 <p>Lists are most commonly used when returning a large number of results from a function that do not fit into any of the previous data structures.</p>
 </div>
diff --git a/public/latent-spaces.html b/public/latent-spaces.html
index d0ee5d97bbba37e70cf8e57077b71b983670f63e..a6d02190cd2614bd29b8fcb22d2fa526b1714305 100644
--- a/public/latent-spaces.html
+++ b/public/latent-spaces.html
@@ -648,7 +648,7 @@ traditional PCA, which is identical to the result from <code>plotPCA</code>.</p>
 <a class="sourceLine" id="cb425-3" data-line-number="3">Y &lt;-<span class="st"> </span>Y[<span class="kw">rowSums</span>(Y) <span class="op">&gt;</span><span class="st"> </span><span class="dv">0</span>, ]</a>
 <a class="sourceLine" id="cb425-4" data-line-number="4"><span class="kw">system.time</span>(res1 &lt;-<span class="st"> </span><span class="kw">glmpca</span>(Y, <span class="dt">L=</span><span class="dv">2</span>, <span class="dt">fam=</span><span class="st">&quot;poi&quot;</span>, <span class="dt">verbose=</span><span class="ot">TRUE</span>))</a></code></pre></div>
 <pre><code>##    user  system elapsed 
-##  87.515  23.205 110.738</code></pre>
+##  84.147  22.275 106.448</code></pre>
 <div class="sourceCode" id="cb427"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb427-1" data-line-number="1">pd1 &lt;-<span class="st"> </span><span class="kw">data.frame</span>(res1<span class="op">$</span>factors, <span class="dt">dimreduce=</span><span class="st">&quot;glmpca-poisson&quot;</span>, <span class="dt">clust =</span> <span class="kw">factor</span>(deng<span class="op">$</span>cell_type2))</a>
 <a class="sourceLine" id="cb427-2" data-line-number="2"><span class="co">## traditional PCA</span></a>
 <a class="sourceLine" id="cb427-3" data-line-number="3">pd2 &lt;-<span class="st"> </span><span class="kw">data.frame</span>(<span class="kw">reducedDim</span>(deng, <span class="st">&quot;PCA&quot;</span>), <span class="dt">dimreduce=</span><span class="st">&quot;runPCA&quot;</span>, <span class="dt">clust =</span> <span class="kw">factor</span>(deng<span class="op">$</span>cell_type2))</a>
diff --git a/public/latent-spaces.md b/public/latent-spaces.md
index 6f627311e697c3f502aaa201b0a5313104b7ef42..e250320ee8620e2428d8b91868355aec98e265e7 100644
--- a/public/latent-spaces.md
+++ b/public/latent-spaces.md
@@ -156,7 +156,7 @@ system.time(res1 <- glmpca(Y, L=2, fam="poi", verbose=TRUE))
 
 ```
 ##    user  system elapsed 
-##  87.515  23.205 110.738
+##  84.147  22.275 106.448
 ```
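GLM-PCA on the full gene set takes close to two minutes here. One way to cut the runtime (an illustrative shortcut, not a step from the course) is to restrict to the most variable genes first; `matrixStats::rowVars` and the 1000-gene cutoff are assumptions:

```r
## Run GLM-PCA on the 1000 most variable genes only; the cutoff is arbitrary.
library(matrixStats)
hvg <- order(rowVars(as.matrix(Y)), decreasing = TRUE)[1:1000]
system.time(res_fast <- glmpca(Y[hvg, ], L = 2, fam = "poi"))
```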
 
 ```r
diff --git a/public/latent-spaces_files/figure-html/glmpca-1.png b/public/latent-spaces_files/figure-html/glmpca-1.png
index e05edbea012a9c15176f2bbff87dbb6834ddcd3a..be5d6c3acf5ee5f37a06f36f577b8bbc2217090a 100644
Binary files a/public/latent-spaces_files/figure-html/glmpca-1.png and b/public/latent-spaces_files/figure-html/glmpca-1.png differ
diff --git a/public/latent-spaces_files/figure-html/tsne-1.png b/public/latent-spaces_files/figure-html/tsne-1.png
index ce27ab68d59e377760ad54906f2781650aad4e1a..2c5d359fba3de0dc7a36189dc6fe39327db7e8d8 100644
Binary files a/public/latent-spaces_files/figure-html/tsne-1.png and b/public/latent-spaces_files/figure-html/tsne-1.png differ
diff --git a/public/latent-spaces_files/figure-html/tsne-2.png b/public/latent-spaces_files/figure-html/tsne-2.png
index 705a7578a32c0f87b5cec5a779587f191532f706..3449cc4b702f9275931fd30422ad0923a110c88e 100644
Binary files a/public/latent-spaces_files/figure-html/tsne-2.png and b/public/latent-spaces_files/figure-html/tsne-2.png differ
diff --git a/public/latent-spaces_files/figure-html/umap-1.png b/public/latent-spaces_files/figure-html/umap-1.png
index 19be660162ad8199a50b5bf7a7ef289045ceab4e..ff7b0af0010e56c33f72f6e4a420b7b529ebc79a 100644
Binary files a/public/latent-spaces_files/figure-html/umap-1.png and b/public/latent-spaces_files/figure-html/umap-1.png differ
diff --git a/public/latent-spaces_files/figure-html/unnamed-chunk-1-1.png b/public/latent-spaces_files/figure-html/unnamed-chunk-1-1.png
index e003203679fe857861d50a8a3e1e0502937a9201..e7da4b8d889de4ec5767a632e1e96e4d9d124e7b 100644
Binary files a/public/latent-spaces_files/figure-html/unnamed-chunk-1-1.png and b/public/latent-spaces_files/figure-html/unnamed-chunk-1-1.png differ
diff --git a/public/latent-spaces_files/figure-html/unnamed-chunk-1-2.png b/public/latent-spaces_files/figure-html/unnamed-chunk-1-2.png
index ea6384cc5f7638c1f7f9ca0c41375fc93a215b51..0bf56123c75b511154a8a08d57cf4387b473570e 100644
Binary files a/public/latent-spaces_files/figure-html/unnamed-chunk-1-2.png and b/public/latent-spaces_files/figure-html/unnamed-chunk-1-2.png differ
diff --git a/public/latent-spaces_files/figure-html/unnamed-chunk-2-1.png b/public/latent-spaces_files/figure-html/unnamed-chunk-2-1.png
index 1d0307af2d208718904d5c7708d3edbb4de51111..40920e3051a849da438206235f1bcc7a7f736a60 100644
Binary files a/public/latent-spaces_files/figure-html/unnamed-chunk-2-1.png and b/public/latent-spaces_files/figure-html/unnamed-chunk-2-1.png differ
diff --git a/public/latent-spaces_files/figure-html/unnamed-chunk-3-1.png b/public/latent-spaces_files/figure-html/unnamed-chunk-3-1.png
index 4eeba48aa2286e1bde42a623f947ac5786838965..50c777f97922d2ca005c425f006650aa9eb56338 100644
Binary files a/public/latent-spaces_files/figure-html/unnamed-chunk-3-1.png and b/public/latent-spaces_files/figure-html/unnamed-chunk-3-1.png differ
diff --git a/public/projection.md b/public/projection.md
index 8cfbda96db8a3873c99268b42ebd61df29e3faa2..6ce4e2b713fddc14090e28c746e160b90f340cd6 100644
--- a/public/projection.md
+++ b/public/projection.md
@@ -372,14 +372,14 @@ plot(getSankey(colData(muraro)$cell_type1,  muraro_to_seger$scmap_cluster_labs[,
 ```
 
 <!-- Sankey generated in R 3.6.0 by googleVis 0.6.4 package -->
-<!-- Thu Oct  3 08:23:58 2019 -->
+<!-- Thu Oct  3 10:17:31 2019 -->
 
 
 <!-- jsHeader -->
 <script type="text/javascript">
  
 // jsData 
-function gvisDataSankeyID626a3560a3f1 () {
+function gvisDataSankeyID8acc41e47c55 () {
 var data = new google.visualization.DataTable();
 var datajson =
 [
@@ -577,8 +577,8 @@ return(data);
 }
  
 // jsDrawChart
-function drawChartSankeyID626a3560a3f1() {
-var data = gvisDataSankeyID626a3560a3f1();
+function drawChartSankeyID8acc41e47c55() {
+var data = gvisDataSankeyID8acc41e47c55();
 var options = {};
 options["width"] = 400;
 options["height"] = 400;
@@ -597,7 +597,7 @@ options["sankey"] = {
             };
 
     var chart = new google.visualization.Sankey(
-    document.getElementById('SankeyID626a3560a3f1')
+    document.getElementById('SankeyID8acc41e47c55')
     );
     chart.draw(data,options);
     
@@ -621,9 +621,9 @@ if (newPackage)
   pkgs.push(chartid);
   
 // Add the drawChart function to the global list of callbacks
-callbacks.push(drawChartSankeyID626a3560a3f1);
+callbacks.push(drawChartSankeyID8acc41e47c55);
 })();
-function displayChartSankeyID626a3560a3f1() {
+function displayChartSankeyID8acc41e47c55() {
   var pkgs = window.__gvisPackages = window.__gvisPackages || [];
   var callbacks = window.__gvisCallbacks = window.__gvisCallbacks || [];
   window.clearTimeout(window.__gvisLoad);
@@ -647,11 +647,11 @@ callbacks.shift()();
 </script>
  
 <!-- jsChart -->  
-<script type="text/javascript" src="https://www.google.com/jsapi?callback=displayChartSankeyID626a3560a3f1"></script>
+<script type="text/javascript" src="https://www.google.com/jsapi?callback=displayChartSankeyID8acc41e47c55"></script>
  
 <!-- divChart -->
   
-<div id="SankeyID626a3560a3f1" 
+<div id="SankeyID8acc41e47c55" 
   style="width: 400; height: 400;">
 </div>
 
@@ -883,7 +883,7 @@ Recently, the Seurat team proposed a new method to integrate single-cell
 datasets using "anchors" [@Stuart2019-zx].
 
 <div class="figure" style="text-align: center">
-<img src="figures/stuart_graphical_abstract.jpg" alt="Seurat integration with anchors." width="90%" />
+<img src="figures/stuart_graphical_abstract.png" alt="Seurat integration with anchors." width="90%" />
 <p class="caption">(\#fig:unnamed-chunk-35)Seurat integration with anchors.</p>
 </div>
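The anchor-based workflow referenced above has roughly the following shape in the Seurat v3 API. This is a sketch only: `seu_list` stands in for a hypothetical list of normalised Seurat objects, one per dataset, and `dims = 1:30` is an illustrative choice.

```r
library(Seurat)
## Find pairwise anchors between datasets, then build one integrated assay.
anchors    <- FindIntegrationAnchors(object.list = seu_list, dims = 1:30)
integrated <- IntegrateData(anchorset = anchors, dims = 1:30)
```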
 
diff --git a/public/pseudotime.md b/public/pseudotime.md
index e1e663fc1a6243bbfee25fd64313ece9f4c694cd..607363eab74695c95e93bd6e16478530cb4e91ce 100644
--- a/public/pseudotime.md
+++ b/public/pseudotime.md
@@ -361,7 +361,6 @@ heatmap(heatdata, Colv = NA,
 <img src="pseudotime_files/figure-html/gam_tm_deg-1.png" width="90%" style="display: block; margin: auto;" />
 
 
-We will regress each gene on the pseudotime variable we have generated, using a general additive model (GAM). This allows us to detect non-linear patterns in gene expression.
 
 ## Monocle
 
@@ -378,12 +377,12 @@ defined as a separate cell state.
 ### Monocle 2
 
 `Monocle 2` [@Qiu2017-xq] uses a different approach, with dimensionality reduction and ordering performed by reverse
-graph embedding (RGE), allowing it to detect branching events in an unsupervised manner. RGE, a machine-learning strategy, learns a 
-‘principal graph’ to describe the single-cell dataset.  RGE also learns the mapping function of data points on the trajectory back to the original high dimentional space simutaneously. In doing so, it aims to position the latent points in the lower dimension space (along the trajectory) while also ensuring their corresponding
+graph embedding (RGE), allowing it to detect branching events in an unsupervised manner. RGE, a machine-learning strategy, learns a ‘principal graph’ to describe the single-cell dataset. RGE also simultaneously learns the mapping function from data points on the trajectory back to the original high-dimensional space. In doing so, it aims to position the latent points in the lower-dimensional space (along the trajectory) while also ensuring their corresponding
 positions in the input dimension are ‘neighbors’.
 
 There are different ways of implementing the RGE framework; `Monocle 2` uses `DDRTree` (discriminative dimensionality reduction via learning a tree) by default.
-DDRTree learns latent points and the projection of latent points to the points in original input space, which is equivalent to "dimension reduction". In addition, it simutanously learns 'principal graph' for K-means soft clustered cetroids for the latent points. Principal graph is the spanning tree of those centroids. 
+
+`DDRTree` learns the latent points and the projection of those latent points back to the original input space, which is equivalent to "dimension reduction". In addition, it simultaneously learns a 'principal graph' over the K-means soft-clustered centroids of the latent points. The principal graph is the spanning tree of those centroids.
 
 DDRTree returns a principal tree of the centroids of cell clusters in low dimension; pseudotime is derived for individual cells by calculating the geodesic distance from their projections onto the tree to the root (user-defined or arbitrarily assigned).
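In practice these steps sit inside a short `Monocle 2` workflow. A sketch of the core calls (assuming `cds` is an existing `CellDataSet` with size factors and dispersions already estimated):

```r
library(monocle)
## DDRTree embedding, then ordering of cells along the learned principal tree.
cds <- reduceDimension(cds, max_components = 2, method = "DDRTree")
cds <- orderCells(cds)   # assigns pseudotime and branch states
plot_cell_trajectory(cds, color_by = "State")
```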
 
@@ -463,12 +462,11 @@ Monocle 2 performs pretty well on these cells.
 
 ### Monocle 3
 
-[`Monocle3`](https://www.nature.com/articles/s41586-019-0969-x)[@Cao2019-cj] is the updated single-cell analysis toolkit for analysing large datasets. [Monocle 3](https://cole-trapnell-lab.github.io/monocle3/docs/starting/) is designed for use with absolute transcript counts (e.g. from UMI experiments). It first does dimension reduction with UMAP and then clusters the cells with Louvian/Leiden algorithms and merge adjacent groups into supergroup, and finaly resovles the trajectories individual cells can take during development, identifies
-the locations of branches and convergences within each supergroup.
+[`Monocle3`](https://www.nature.com/articles/s41586-019-0969-x)[@Cao2019-cj] is the updated single-cell analysis toolkit for analysing large datasets. [Monocle 3](https://cole-trapnell-lab.github.io/monocle3/docs/starting/) is designed for use with absolute transcript counts (e.g. from UMI experiments). It first does dimension reduction with UMAP, then clusters the cells with the Louvain/Leiden algorithms, merges adjacent groups into supergroups, and finally resolves the trajectories that individual cells can take during development within each supergroup.
 
 In short, Monocle3 uses `UMAP` to construct an initial trajectory and refines it by learning a principal graph.
 
-It builds KNN graph in the UMAP dimensions and runs Louvain/Leiden algorithms om the KNN graph to derive communities; edges are drawn to connect communities that have more links (Partitioned Approximate Graph Abstraction (PAGA) graph). Each component of the PAGA grah is passed to the next step which is learning principal graph based on the SimplePPT algorithm. The pseudotime is calculated for individual cells by projecting the cells to their nearest point on the principal graph edge and measure geodesic distance along of principal points to the closest of their root nodes.
+It builds a KNN graph in the UMAP dimensions and runs the Louvain/Leiden algorithms on the KNN graph to derive communities; edges are drawn to connect communities that share more links, giving a Partitioned Approximate Graph Abstraction (PAGA) graph. Each component of the PAGA graph is passed to the next step, which learns a principal graph based on the SimplePPT algorithm. Pseudotime is calculated for individual cells by projecting each cell onto its nearest point on a principal graph edge and measuring the geodesic distance along the principal points to the closest root node.
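The corresponding Monocle 3 calls, as a sketch (with `cds` a `cell_data_set`, as used in the code chunks below; `num_dim = 50` is an illustrative choice):

```r
library(monocle3)
cds <- preprocess_cds(cds, num_dim = 50)        # PCA preprocessing
cds <- reduce_dimension(cds)                    # UMAP embedding
cds <- cluster_cells(cds)                       # communities and partitions
cds <- learn_graph(cds)                         # SimplePPT principal graph
cds <- order_cells(cds)                         # choose root(s), get pseudotime
plot_cells(cds, color_cells_by = "pseudotime")
```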
 
 
 
@@ -539,7 +537,8 @@ plot_cells(cds, color_cells_by="cell_type2", graph_label_size = 4, cell_size = 2
 <img src="pseudotime_files/figure-html/run_monocle3-2.png" width="90%" style="display: block; margin: auto;" />
 
 ```r
-plot_cells(cds,  graph_label_size = 6, cell_size = 1, color_cells_by="pseudotime",
+plot_cells(cds,  graph_label_size = 6, cell_size = 1, 
+           color_cells_by="pseudotime",
            group_label_size = 6)
 ```
 
diff --git a/public/quality-control-and-data-visualisation.html b/public/quality-control-and-data-visualisation.html
index 5009b821c31b1b7bf2f5d67c8a03c9618899c1db..36e19720d908f829577f8659f963b78b0cabc159 100644
--- a/public/quality-control-and-data-visualisation.html
+++ b/public/quality-control-and-data-visualisation.html
@@ -834,39 +834,41 @@ doublet scores.</p>
 <a class="sourceLine" id="cb218-4" data-line-number="4"></a>
 <a class="sourceLine" id="cb218-5" data-line-number="5"><span class="co">#- Annotate doublet using binary classification based doublet scoring:</span></a>
 <a class="sourceLine" id="cb218-6" data-line-number="6">umi =<span class="st"> </span><span class="kw">bcds</span>(umi)</a></code></pre></div>
-<pre><code>## [1]  train-error:0.058012+0.009028   test-error:0.082759+0.011116 
+<pre><code>## [1]  train-error:0.066402+0.005931   test-error:0.103638+0.025177 
 ## Multiple eval metrics are present. Will use test_error for early stopping.
 ## Will train until test_error hasn&#39;t improved in 2 rounds.
 ## 
-## [2]  train-error:0.043255+0.006356   test-error:0.076937+0.011995 
-## [3]  train-error:0.033853+0.003265   test-error:0.064193+0.016507 
-## [4]  train-error:0.029800+0.006149   test-error:0.059020+0.009223 
-## [5]  train-error:0.024594+0.002186   test-error:0.053783+0.016495 
-## [6]  train-error:0.021410+0.002716   test-error:0.051489+0.011570 
-## [7]  train-error:0.017650+0.001547   test-error:0.052659+0.008845 
-## [8]  train-error:0.016202+0.002439   test-error:0.043973+0.006895 
-## [9]  train-error:0.014032+0.002261   test-error:0.046287+0.012793 
-## [10] train-error:0.011573+0.001440   test-error:0.042244+0.010619 
-## [11] train-error:0.009404+0.001444   test-error:0.039930+0.010749 
-## [12] train-error:0.009837+0.001076   test-error:0.039930+0.010904 
-## [13] train-error:0.008535+0.001904   test-error:0.038771+0.010294 
-## [14] train-error:0.007523+0.001079   test-error:0.038775+0.010791 
-## [15] train-error:0.006655+0.001675   test-error:0.036457+0.008707 
-## [16] train-error:0.005497+0.000577   test-error:0.037616+0.006611 
-## [17] train-error:0.004919+0.000542   test-error:0.037032+0.006923 
+## [2]  train-error:0.049912+0.002718   test-error:0.085687+0.015561 
+## [3]  train-error:0.036747+0.002381   test-error:0.079870+0.015979 
+## [4]  train-error:0.029948+0.002910   test-error:0.071214+0.015731 
+## [5]  train-error:0.026042+0.003332   test-error:0.072934+0.012966 
+## [6]  train-error:0.022424+0.002773   test-error:0.064263+0.011326 
+## [7]  train-error:0.019385+0.003075   test-error:0.058461+0.012930 
+## [8]  train-error:0.018084+0.001510   test-error:0.056722+0.009848 
+## [9]  train-error:0.016203+0.002440   test-error:0.052094+0.012606 
+## [10] train-error:0.013310+0.003702   test-error:0.049775+0.013542 
+## [11] train-error:0.011574+0.002856   test-error:0.046891+0.012536 
+## [12] train-error:0.010705+0.001959   test-error:0.046302+0.011468 
+## [13] train-error:0.009693+0.001081   test-error:0.043413+0.012598 
+## [14] train-error:0.008826+0.000545   test-error:0.042254+0.011269 
+## [15] train-error:0.007813+0.000709   test-error:0.040515+0.011465 
+## [16] train-error:0.006655+0.000705   test-error:0.042249+0.011692 
+## [17] train-error:0.005498+0.000584   test-error:0.042254+0.012804 
 ## Stopping. Best iteration:
-## [15] train-error:0.006655+0.001675   test-error:0.036457+0.008707
+## [15] train-error:0.007813+0.000709   test-error:0.040515+0.011465
 ## 
-## [1]  train-error:0.054977 
+## [1]  train-error:0.068866 
 ## Will train until train_error hasn&#39;t improved in 2 rounds.
 ## 
-## [2]  train-error:0.039931 
-## [3]  train-error:0.028356 
-## [4]  train-error:0.029514 
-## [5]  train-error:0.023148 
-## [6]  train-error:0.018519 
-## [7]  train-error:0.018519 
-## [8]  train-error:0.015625</code></pre>
+## [2]  train-error:0.038194 
+## [3]  train-error:0.034144 
+## [4]  train-error:0.026620 
+## [5]  train-error:0.024884 
+## [6]  train-error:0.020255 
+## [7]  train-error:0.020255 
+## [8]  train-error:0.019097 
+## [9]  train-error:0.016782 
+## [10] train-error:0.013310</code></pre>
 <div class="sourceCode" id="cb220"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb220-1" data-line-number="1"><span class="co">#- Combine both annotations into a hybrid annotation</span></a>
 <a class="sourceLine" id="cb220-2" data-line-number="2">umi =<span class="st"> </span><span class="kw">cxds_bcds_hybrid</span>(umi)</a>
 <a class="sourceLine" id="cb220-3" data-line-number="3"></a>
@@ -874,12 +876,12 @@ doublet scores.</p>
 <a class="sourceLine" id="cb220-5" data-line-number="5">CD  =<span class="st"> </span><span class="kw">colData</span>(umi)</a>
 <a class="sourceLine" id="cb220-6" data-line-number="6"><span class="kw">head</span>(<span class="kw">cbind</span>(CD<span class="op">$</span>cxds_score,CD<span class="op">$</span>bcds_score, CD<span class="op">$</span>hybrid_score))</a></code></pre></div>
 <pre><code>##                    [,1]        [,2]      [,3]
-## NA19098.r1.A01 4131.405 0.020812672 0.2531013
-## NA19098.r1.A02 4564.089 0.010029603 0.2674993
-## NA19098.r1.A03 2827.904 0.005551378 0.1611125
-## NA19098.r1.A04 4708.213 0.005341432 0.2711787
-## NA19098.r1.A05 6134.590 0.005762757 0.3552646
-## NA19098.r1.A06 5810.730 0.010443962 0.3410366</code></pre>
+## NA19098.r1.A01 4131.405 0.012810320 0.2455279
+## NA19098.r1.A02 4564.089 0.004930826 0.2628488
+## NA19098.r1.A03 2827.904 0.020504847 0.1769445
+## NA19098.r1.A04 4708.213 0.009793874 0.2762736
+## NA19098.r1.A05 6134.590 0.006487557 0.3565501
+## NA19098.r1.A06 5810.730 0.008279418 0.3393878</code></pre>
 <div class="sourceCode" id="cb222"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb222-1" data-line-number="1"><span class="kw">plotColData</span>(</a>
 <a class="sourceLine" id="cb222-2" data-line-number="2">    umi,</a>
 <a class="sourceLine" id="cb222-3" data-line-number="3">    <span class="dt">x =</span> <span class="st">&quot;total_features_by_counts&quot;</span>,</a>
diff --git a/public/search_index.json b/public/search_index.json
index 7ddd687528b39a40b355185d0dad41203489f331..7c30219f6669afab8b30457f08292de5ea5714eb 100644
--- a/public/search_index.json
+++ b/public/search_index.json
@@ -1,15 +1,15 @@
 [
 ["index.html", "Analysis of single cell RNA-seq data 1 About the course 1.1 Web page 1.2 GitLab 1.3 Video 1.4 Docker image 1.5 Manual installation 1.6 Citation 1.7 License 1.8 Prerequisites 1.9 Contact", " Analysis of single cell RNA-seq data Ruqian Lyu, PuXue Qiao, and Davis J. McCarthy (davisjmcc) 2019-10-03 1 About the course Today it is possible to obtain genome-wide transcriptome data from single cells using high-throughput sequencing (scRNA-seq). The cellular resolution and the genome-wide scope of scRNA-seq makes it possible to address issues that are intractable using other methods like bulk RNA-seq or single-cell RT-qPCR. However, scRNA-seq data poses many challenges due to the scale and complexity of scRNA-seq datasets, with novel methods often required to account for the particular characteristics of the data. In this course we will discuss some of the questions that can be addressed using scRNA-seq as well as the available computational and statistical methods. We will cover key features of the technology platforms and fundamental principles of scRNA-seq data analysis that are transferable across technologies and analysis workflows. The number of computational tools is already vast and increasing rapidly, so we provide hands-on workflows using some of our favourite tools on carefully selected, biologically-relevant example datasets. Across two days, attendees can expect to gain an understanding of approaches to and practical analysis experience on: quality control, data normalisation, visualisation, clustering, trajectory (pseudotime) inference, differential expression, batch correction and data integration. Course outline: Day 1: Morning session 1: Workshop overview; introduction to scRNA-seq; pre-processing scRNA-seq data Morning session 2: Quality control, visualisation and exploratory data analysis Afternoon session 1: Normalisation, confounders and batch correction Afternoon session 2: Latent spaces, clustering and cell annotation Day 2: Morning session 1: Trajectory inference Morning session 2: Differential expression; data imputation Afternoon session 1: Combining datasets and data integration Afternoon session 2: Case studies This course has been adapted from a course taught through the University of Cambridge Bioinformatics training unit, but the material is meant for anyone interested in learning about computational analysis of scRNA-seq data and is updated roughly twice per year. The number of computational tools is increasing rapidly and we are doing our best to keep up to date with what is available. One of the main constraints for this course is that we would like to use tools that are implemented in R and that run reasonably fast. Moreover, we will also confess to being somewhat biased towards methods that have been developed either by us or by our friends and colleagues. 1.1 Web page The html version of the workshop material is available at the following link: https://biocellgen-public.svi.edu.au/mig_2019_scrnaseq-workshop/public/index.html 1.2 GitLab The source code and materials for the course are available at the SVI Bioinformatics and Cellular Genomics Lab’s GitLab: https://gitlab.svi.edu.au/biocellgen-public/mig_2019_scrnaseq-workshop 1.3 Video This video was recorded during the course (2 days) in May 2019 in Cambridge, UK. This recorded version of the course differs slightly from the version in this document. 
1.3.1 Day 1 1.3.2 Day 2 1.4 Docker image The course can be reproduced without any package installation by running the course docker image which contains all the required packages. Workshop Docker Repository on DockerHub 1.4.1 Run the image Make sure Docker is installed on your system. If not, please follow these instructions. To run the course docker image (use the latest version): docker run -p 8888:8888 -e PASSWORD=&quot;jupyter&quot; svibiocellgen/mig_2019_scrnaseq-workshop:v1.01 Then follow the instructions provided, e.g.: To access the notebook, open this file in a browser: file:///home/jovyan/.local/share/jupyter/runtime/nbserver-6-open.html Or copy and paste one of these URLs: http://(a9ee1aad5398 or 127.0.0.1):8888/?token=22debff49d9aae3c50e6b0b5241b47eefb1b8f883fcb7e6d A Jupyter session will be open in a web browser (we recommend Chrome). 1.4.1.1 Windows users On Windows operating system the IP address of the container can be different from 127.0.0.1 (localhost). To find the IP address please run: docker-machine ip default 1.4.2 Download data/other files 1.4.2.1 Download from AWS (within Docker) Recommended if you are using Docker In the Jupyter session, please click on New -&gt; Terminal. In the new terminal window please run: ./poststart.sh 1.4.2.2 Manual download from AWS If you want to download data files from AWS outside of Docker image you can still use the same poststart.sh script but you will need to install AWS CLI on your computer. Alternatively, you can browse and download the files in you web-browser by visiting this link. NB: Only the core datasets (i.e. not Tabula Muris) are available from AWS storage. 1.4.2.3 Manual download from SVI Recommended if you are using your own computer For simplicity, we have also hosted the core datasets used in the course and a subset of the Tabula Muris data on SVI websites. There are two files to download, both “tarballs”, i.e. compressed archives of multiple folders and files. 1.4.2.3.1 Core datasets To download the core datasets, click this link (195Mb). It is most convenient to download the tarball to the head directory for the course. We then want to unpack the tarball and move it to a directory called data in the head directory of the repository. To do this at the command line: wget https://www.svi.edu.au/MIG_2019_scRNAseq-workshop/mig-sc-workshop-2019-data.tar.gz mkdir workshop-data tar -xvf mig-sc-workshop-2019-data.tar.gz --directory workshop-data mv workshop-data/mnt/mcfiles/Datasets/MIG_2019_scRNAseq-workshop/data ./ rm -r workshop-data [This requires a little bit of faff to get all of the directory paths correct and then tidy updated.] Alternatively, if you are working on your laptop, unpack the tarball using the default method on your system (usually a double click on the *.tar.gz file will do the trick) and drag and drop the data folder to the workshop directory. 1.4.2.3.2 Tabula Muris To download the Tabula Muris data, clink this link (655Mb). We then go through a similar process as described above to unpack the tarball. 
wget https://www.svi.edu.au/MIG_2019_scRNAseq-workshop/Tabula_Muris.tar.gz tar -xvf Tabula_Muris.tar.gz mv mnt/mcfiles/Datasets/Tabula_Muris data rm -r mnt 1.4.2.3.3 Desired results The data folder then should contain both the core datasets and the Tabula Muris data, and have the following structure: data ├── 10cells_barcodes.txt ├── 2000_reference.transcripts.fa ├── deng │   └── deng-reads.rds ├── droplet_id_example_per_barcode.txt.gz ├── droplet_id_example_truth.gz ├── EXAMPLE.cram ├── pancreas │   ├── muraro.rds │   └── segerstolpe.rds ├── pbmc3k_filtered_gene_bc_matrices │   └── hg19 │   ├── barcodes.tsv │   ├── genes.tsv │   └── matrix.mtx ├── sce │   ├── Heart_10X.rds │   └── Thymus_10X.rds ├── Tabula_Muris │   ├── droplet │   │   ├── droplet │   │   ├── droplet_annotation.csv │   │   └── droplet_metadata.csv │   └── FACS_smartseq2 │   ├── FACS │   ├── FACS_annotations.csv │   └── FACS_metadata.csv └── tung ├── annotation.txt ├── molecules.txt ├── reads.txt ├── TNs.txt └── TPs.txt 11 directories, 22 files With the files in these locations, everything is set up to run the code as presented in the RMarkdown files in the workshop. 1.4.3 RStudio Now go back to Jupyter browser tab and change word tree in the url to rstudio. RStudio server will open with all of the course files, software and the data folder available. 1.5 Manual installation If you are not using a docker image of the course, then to be able to run all code chunks of the course you need to clone or download the course GitHub repository and start an R session in the course_files folder. You will also need to install all required packages manually. We are using Bioconductor version 3.9 packages in this version of the course. The install.R file in the workshop repository provides the necessary commands for installing all of the required packages. You can run this script from the command line with Rscript install.R or copy-and-paste the commands into an R session and run them interactively. Alternatively, you can just install packages listed in a chapter of interest. 1.6 Citation This version of the workshop has been updated by Davis J. McCarthy, Ruqian Lyu and PuXue Qiao, based on the 2019-07-01 version of the course: Ruqian Lyu, PuXue Qiao, Vladimir Kiselev, Tallulah Andrews, Jennifer Westoby, Maren Büttner, Jimmy Lee, Krzysztof Polanski, Sebastian Y. Müller, Elo Madissoon, Stephane Ballereau, Maria Do Nascimento Lopes Primo, Rocio Martinez Nunez, Martin Hemberg and Davis J. McCarthy, (2019), “Analysis of single cell RNA-seq data”, https://scrnaseq-course.cog.sanger.ac.uk/website/index.html 1.7 License All of the course material is licensed under GPL-3. Anyone is welcome to go through the material in order to learn about analysis of scRNA-seq data. If you plan to use the material for your own teaching, we would appreciate if you tell us about it in addition to providing a suitable citation. 1.8 Prerequisites The course is intended for those who have basic familiarity with Unix and the R statistical language. We will also assume that you are familiar with mapping and analysing bulk RNA-seq data as well as with the commonly available computational tools. We recommend attending the Introduction to RNA-seq and ChIP-seq data analysis or the Analysis of high-throughput sequencing data with Bioconductor before attending this course. 1.9 Contact If you have any comments, questions or suggestions about the material, please contact Davis McCarthy. "],
 ["introduction-to-single-cell-rna-seq.html", "2 Introduction to single-cell RNA-seq 2.1 Bulk RNA-seq 2.2 scRNA-seq 2.3 Workflow 2.4 Computational Analysis 2.5 Challenges 2.6 Experimental methods 2.7 What platform to use for my experiment? 2.8 Unique Molecular Identifiers (UMIs)", " 2 Introduction to single-cell RNA-seq 2.1 Bulk RNA-seq A major breakthrough (replaced microarrays) in the late 00’s and has been widely used since Measures the average expression level for each gene across a large population of input cells Useful for comparative transcriptomics, e.g. samples of the same tissue from different species Useful for quantifying expression signatures from ensembles, e.g. in disease studies Insufficient for studying heterogeneous systems, e.g. early development studies, complex tissues (brain) Does not provide insights into the stochastic nature of gene expression 2.2 scRNA-seq A new technology, first publication by (Tang et al. 2009) Did not gain widespread popularity until ~2014 when new protocols and lower sequencing costs made it more accessible Measures the distribution of expression levels for each gene across a population of cells Allows to study new biological questions in which cell-specific changes in transcriptome are important, e.g. cell type identification, heterogeneity of cell responses, stochasticity of gene expression, inference of gene regulatory networks across the cells. Datasets range from \\(10^2\\) to \\(10^6\\) cells and increase in size every year Currently there are several different protocols in use, e.g. SMART-seq2 (Picelli et al. 2013), CELL-seq (Hashimshony et al. 2012) and Drop-seq (Macosko et al. 2015) There are also commercial platforms available, including the Fluidigm C1, Wafergen ICELL8 and the 10X Genomics Chromium Several computational analysis methods from bulk RNA-seq can be used In most cases computational analysis requires adaptation of the existing methods or development of new ones 2.3 Workflow Figure 2.1: Single cell sequencing (taken from Wikipedia) Overall, experimental scRNA-seq protocols are similar to the methods used for bulk RNA-seq. We will be discussing some of the most common approaches in the next chapter. 2.4 Computational Analysis This course is concerned with the computational analysis of the data obtained from scRNA-seq experiments. The first steps (yellow) are general for any highthroughput sequencing data. Later steps (orange) require a mix of existing RNASeq analysis methods and novel methods to address the technical difference of scRNASeq. Finally the biological interpretation (blue) should be analyzed with methods specifically developed for scRNASeq. Figure 2.2: Flowchart of the scRNA-seq analysis There are several reviews of the scRNA-seq analysis available including (Stegle, Teichmann, and Marioni 2015). Today, there are also several different platforms available for carrying out one or more steps in the flowchart above. These include: Falco a single-cell RNA-seq processing framework on the cloud. SCONE (Single-Cell Overview of Normalized Expression), a package for single-cell RNA-seq data quality control and normalization. Seurat is an R package designed for QC, analysis, and exploration of single cell RNA-seq data. ASAP (Automated Single-cell Analysis Pipeline) is an interactive web-based platform for single-cell analysis. Bioconductor is a open-source, open-development software project for the analysis of high-throughput genomics data, including packages for the analysis of single-cell data. 
2.5 Challenges The main difference between bulk and single cell RNA-seq is that each sequencing library represents a single cell, instead of a population of cells. Therefore, significant attention has to be paid to comparison of the results from different cells (sequencing libraries). The main sources of discrepancy between the libraries are: Reverse transcription to convert RNA to cDNA is at best &lt;30% efficient Amplification (up to 1 million fold) Gene ‘dropouts’ in which a gene is observed at a moderate expression level in one cell but is not detected in another cell (Kharchenko, Silberstein, and Scadden 2014); this can be due to technical factors (e.g. inefficient RT) or true biological variability across cells. These discrepancies are introduced due to low starting amounts of transcripts since the RNA comes from one cell only. Improving the transcript capture efficiency and reducing the amplification bias are currently active areas of research. However, as we shall see in this course, it is possible to alleviate some of these issues through proper normalization and corrections and effective statistical models. For the analyst, the characteristics of single-cell RNA-seq data lead to challenges in handling: Sparsity Variability Scalability Complexity In this workshop we will present computational approaches that can allow us to face these challenges as we try to answer biological questions of interest from single-cell transcriptomic data. 2.6 Experimental methods Figure 2.3: Moore’s law in single cell transcriptomics (image taken from Svensson et al) Development of new methods and protocols for scRNA-seq is currently a very active area of research, and several protocols have been published over the last few years. A non-comprehensive list includes: CEL-seq (Hashimshony et al. 2012) CEL-seq2 (Hashimshony et al. 2016) Drop-seq (Macosko et al. 2015) InDrop-seq (Klein et al. 2015) MARS-seq (Jaitin et al. 2014) SCRB-seq (Soumillon et al. 2014) Seq-well (Gierahn et al. 2017) Smart-seq (Picelli et al. 2014) Smart-seq2 (Picelli et al. 2014) SMARTer STRT-seq (Islam et al. 2013) The methods can be categorized in different ways, but the two most important aspects are quantification and capture. For quantification, there are two types, full-length and tag-based. The former tries to achieve a uniform read coverage of each transcript. By contrast, tag-based protocols only capture either the 5’- or 3’-end of each RNA. The choice of quantification method has important implications for what types of analyses the data can be used for. In theory, full-length protocols should provide an even coverage of transcripts, but as we shall see, there are often biases in the coverage. The main advantage of tag-based protocol is that they can be combined with unique molecular identifiers (UMIs) which can help improve the quantification (see chapter 2.8). On the other hand, being restricted to one end of the transcript may reduce the mappability and it also makes it harder to distinguish different isoforms (Archer et al. 2016). The strategy used for capture determines throughput, how the cells can be selected as well as what kind of additional information besides the sequencing that can be obtained. The three most widely used options are microwell-, microfluidic- and droplet- based. Figure 2.4: Image of microwell plates (image taken from Wikipedia) For well-based platforms, cells are isolated using for example pipette or laser capture and placed in microfluidic wells. 
One advantage of well-based methods is that they can be combined with fluorescent activated cell sorting (FACS), making it possible to select cells based on surface markers. This strategy is thus very useful for situations when one wants to isolate a specific subset of cells for sequencing. Another advantage is that one can take pictures of the cells. The image provides an additional modality and a particularly useful application is to identify wells containg damaged cells or doublets. The main drawback of these methods is that they are often low-throughput and the amount of work required per cell may be considerable. Figure 2.5: Image of a 96-well Fluidigm C1 chip (image taken from Fluidigm) Microfluidic platforms, such as Fluidigm’s C1, provide a more integrated system for capturing cells and for carrying out the reactions necessary for the library preparations. Thus, they provide a higher throughput than microwell based platforms. Typically, only around 10% of cells are captured in a microfluidic platform and thus they are not appropriate if one is dealing with rare cell-types or very small amounts of input. Moreover, the chip is relatively expensive, but since reactions can be carried out in a smaller volume money can be saved on reagents. Figure 2.6: Schematic overview of the drop-seq method (Image taken from Macosko et al) The idea behind droplet based methods is to encapsulate each individual cell inside a nanoliter droplet together with a bead. The bead is loaded with the enzymes required to construct the library. In particular, each bead contains a unique barcode which is attached to all of the reads originating from that cell. Thus, all of the droplets can be pooled, sequenced together and the reads can subsequently be assigned to the cell of origin based on the barcodes. Droplet platforms typically have the highest throughput since the library preparation costs are on the order of \\(.05\\) USD/cell. Instead, sequencing costs often become the limiting factor and a typical experiment the coverage is low with only a few thousand different transcripts detected (Ziegenhain et al. 2017). 2.7 What platform to use for my experiment? The most suitable platform depends on the biological question at hand. For example, if one is interested in characterizing the composition of a tissue, then a droplet-based method which will allow a very large number of cells to be captured is likely to be the most appropriate. On the other hand, if one is interesting in characterizing a rare cell-population for which there is a known surface marker, then it is probably best to enrich using FACS and then sequence a smaller number of cells. Clearly, full-length transcript quantification will be more appropriate if one is interested in studying different isoforms since tagged protocols are much more limited. By contrast, UMIs can only be used with tagged protocols and they can facilitate gene-level quantification. Two recent studies from the Enard group (Ziegenhain et al. 2017) and the Teichmann group (Svensson et al. 2017) have compared several different protocols. In their study, Ziegenhain et al compared five different protocols on the same sample of mouse embryonic stem cells (mESCs). By controlling for the number of cells as well as the sequencing depth, the authors were able to directly compare the sensitivity, noise-levels and costs of the different protocols. 
One example of their conclusions is illustrated in the figure below, which shows the number of genes detected (for a given detection threshold) for the different methods. As you can see, there is almost a two-fold difference between drop-seq and Smart-seq2, suggesting that the choice of protocol can have a major impact on the study. Figure 2.7: Enard group study Svensson et al take a different approach by using synthetic transcripts (spike-ins, more about these later) with known concentrations to measure the accuracy and sensitivity of different protocols. Comparing a wide range of studies, they also reported substantial differences between the protocols. Figure 2.8: Teichmann group study As protocols are developed and computational methods for quantifying the technical noise are improved, it is likely that future studies will help us gain further insights regarding the strengths of the different methods. These comparative studies are valuable not only for helping researchers decide which protocol to use, but also for developing new methods, as the benchmarking makes it possible to determine which strategies are the most useful. 2.8 Unique Molecular Identifiers (UMIs) Thanks to Andreas Buness from EMBL Monterotondo for collaboration on this section. 2.8.1 Introduction Unique Molecular Identifiers are short (4-10bp) random barcodes added to transcripts during reverse-transcription. They enable sequencing reads to be assigned to individual transcript molecules and thus allow the removal of amplification noise and biases from scRNA-seq data. Figure 2.9: UMI sequencing protocol When sequencing UMI-containing data, techniques are used to specifically sequence only the end of the transcript containing the UMI (usually the 3’ end). 2.8.2 Mapping Barcodes Since the number of unique barcodes (\\(4^N\\), where \\(N\\) is the length of the UMI) is much smaller than the total number of molecules per cell (~\\(10^6\\)), each barcode will typically be assigned to multiple transcripts. Hence, to identify unique molecules both the barcode and the mapping location (transcript) must be used. The first step is to map UMI reads, for which we recommend using STAR since it is fast and outputs high-quality BAM alignments. Moreover, mapping locations can be useful for, e.g., identifying poorly-annotated 3’ UTRs of transcripts. UMI-sequencing typically consists of paired-end reads where one read from each pair captures the cell and UMI barcodes while the other read consists of exonic sequence from the transcript (Figure 2.10). Note that trimming and/or filtering to remove reads containing poly-A sequence is recommended to avoid errors due to these reads mapping to genes/transcripts with internal poly-A/poly-T sequences. After processing the reads from a UMI experiment, the following conventions are often used: The UMI is added to the read name of the other paired read. Reads are sorted into separate files by cell barcode For extremely large, shallow datasets, the cell barcode may be added to the read name as well to reduce the number of files. Figure 2.10: UMI sequencing reads, red lightning bolts represent different fragmentation locations 2.8.3 Counting Barcodes In theory, every unique UMI-transcript pair should represent all reads originating from a single RNA molecule. However, in practice this is frequently not the case and the most common reasons are: Different UMI does not necessarily mean different molecule Due to PCR or sequencing errors, base-pair substitution events can result in new UMI sequences. 
Longer UMIs give more opportunity for errors to arise and, based on estimates from cell barcodes, we expect 7-10% of 10bp UMIs to contain at least one error. If not corrected for, this type of error will result in an overestimate of the number of transcripts. Different transcript does not necessarily mean different molecule Mapping errors and/or multimapping reads may result in some UMIs being assigned to the wrong gene/transcript. This type of error will also result in an overestimate of the number of transcripts. Same UMI does not necessarily mean same molecule Biases in UMI frequency and short UMIs can result in the same UMI being attached to different mRNA molecules from the same gene. Thus, the number of transcripts may be underestimated. Figure 2.11: Potential Errors in UMIs 2.8.4 Correcting for Errors How to best account for errors in UMIs remains an active area of research. The best approaches that we are aware of for resolving the issues mentioned above are: UMI-tools’ directional-adjacency method implements a procedure which considers both the number of mismatches and the relative frequency of similar UMIs to identify likely PCR/sequencing errors. Currently an open question. The problem may be mitigated by removing UMIs with few reads to support their association with a particular transcript, or by removing all multi-mapping reads. Simple saturation (aka “collision probability”) correction proposed by Grun, Kester and van Oudenaarden (2014) to estimate the true number of molecules \\(M\\): \\[M \\approx -N \\log\\left(1 - \\frac{n}{N}\\right)\\] where \\(N\\) is the total number of unique UMI barcodes and \\(n\\) is the number of observed barcodes. An important caveat of this method is that it assumes that all UMIs are equally frequent. In most cases this is incorrect, since there is often a bias related to the GC content. Figure 2.12: Per gene amplification rate Determining how to best process and use UMIs is currently an active area of research in the bioinformatics community. We are aware of several methods that have recently been developed, including: UMI-tools PoissonUMIs zUMIs dropEst 2.8.5 Downstream Analysis Current UMI platforms (DropSeq, InDrop, ICell8) exhibit low and highly variable capture efficiency, as shown in the figure below. Figure 2.13: Variability in Capture Efficiency This variability can introduce strong biases and needs to be considered in downstream analysis. Recent analyses often pool cells/genes together based on cell-type or biological pathway to increase the power. Robust statistical analysis of such data is still an open research question and it remains to be determined how to best adjust for biases. Exercise 1 We have provided you with UMI counts and read counts from induced pluripotent stem cells generated from three different individuals (Tung et al. 2017) (see: Chapter 6.1 for details of this dataset). umi_counts &lt;- read.table(&quot;data/tung/molecules.txt&quot;, sep = &quot;\\t&quot;) read_counts &lt;- read.table(&quot;data/tung/reads.txt&quot;, sep = &quot;\\t&quot;) Using this data: Plot the variability in capture efficiency Determine the amplification rate: average number of reads per UMI (one possible starting point is sketched below). References "],
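As an aside, here is a minimal R sketch of the saturation correction above, together with one possible starting point for Exercise 1. The function names and illustrative numbers are our own, and the column-sum proxies for capture efficiency and amplification rate are assumptions rather than a prescribed method:

# Size of the UMI barcode space: 4^N possible sequences for a UMI of length N
umi_space &lt;- function(umi_length) 4^umi_length
umi_space(10)                               # 1048576 possible 10bp UMIs

# Saturation ("collision probability") correction of Grun et al. (2014):
# estimate the true number of molecules M from n observed barcodes out of a
# barcode space of size N
correct_saturation &lt;- function(n, N) -N * log(1 - n / N)
correct_saturation(1000, umi_space(10))     # ~1000.5, barely saturated
correct_saturation(5e5, umi_space(10))      # ~679000, heavily saturated

# Possible starting point for Exercise 1 (column sums per cell are an
# illustrative proxy, not the only sensible choice):
# boxplot(colSums(umi_counts))                         # capture variability
# summary(colSums(read_counts) / colSums(umi_counts))  # mean reads per UMI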
-["introduction-to-rbioconductor.html", "3 Introduction to R/Bioconductor 3.1 Installing packages 3.2 Installation instructions: 3.3 Data-types/classes 3.4 Basic data structures 3.5 Accessing documentation and help files 3.6 Data Types 3.7 What is Bioconductor? 3.8 SingleCellExperiment class 3.9 scater package 3.10 Introduction to ggplot2", " 3 Introduction to R/Bioconductor 3.1 Installing packages 3.1.1 CRAN The Comprehensive R Archive Network CRAN is the biggest archive of R packages. There are few requirements for uploading packages besides building and installing succesfully, hence documentation and support is often minimal and figuring how to use these packages can be a challenge it itself. CRAN is the default repository R will search to find packages to install: install.packages(&quot;devtools&quot;) require(&quot;devtools&quot;) 3.1.2 Github Github isn’t specific to R, any code of any type in any state can be uploaded. There is no guarantee a package uploaded to github will even install, nevermind do what it claims to do. R packages can be downloaded and installed directly from github using the “devtools” package installed above. devtools::install_github(&quot;tallulandrews/M3Drop&quot;) Github is also a version control system which stores multiple versions of any package. By default the most recent “master” version of the package is installed. If you want an older version or the development branch this can be specified using the “ref” parameter: # different branch devtools::install_github(&quot;tallulandrews/M3D&quot;, ref=&quot;nbumi&quot;) # previous commit devtools::install_github(&quot;tallulandrews/M3Drop&quot;, ref=&quot;434d2da28254acc8de4940c1dc3907ac72973135&quot;) Note: make sure you re-install the M3Drop master branch for later in the course. 3.1.3 Bioconductor Bioconductor is a repository of R-packages specifically for biological analyses. It has the strictest requirements for submission, including installation on every platform and full documentation with a tutorial (called a vignette) explaining how the package should be used. Bioconductor also encourages utilization of standard data structures/classes and coding style/naming conventions, so that, in theory, packages and analyses can be combined into large pipelines or workflows. source(&quot;https://bioconductor.org/biocLite.R&quot;) biocLite(&quot;edgeR&quot;) Note: in some situations it is necessary to substitute “http://” for “https://” in the above depending on the security features of your internet connection/network. Bioconductor also requires creators to support their packages and has a regular 6-month release schedule. Make sure you are using the most recent release of bioconductor before trying to install packages for the course. source(&quot;https://bioconductor.org/biocLite.R&quot;) biocLite(&quot;BiocUpgrade&quot;) 3.1.4 Source The final way to install packages is directly from source. In this case you have to download a fully built source code file, usually packagename.tar.gz, or clone the github repository and rebuild the package yourself. Generally this will only be done if you want to edit a package yourself, or if for some reason the former methods have failed. install.packages(&quot;M3Drop_3.05.00.tar.gz&quot;, type=&quot;source&quot;) 3.2 Installation instructions: All the packages necessary for this course are available here. 
Starting from “RUN Rscript -e”install.packages(‘devtools’)&quot; “, run each of the commands (minus”RUN&quot;) on the command line or start an R session and run each of the commands within the quotation marks. Note the ordering of the installation is important in some cases, so make sure you run them in order from top to bottom. 3.3 Data-types/classes R is a high level language so the underlying data-type is generally not important. The exception if you are accessing R data directly using another language such as C, but that is beyond the scope of this course. Instead we will consider the basic data classes: numeric, integer, logical, and character, and the higher level data class called “factor”. You can check what class your data is using the “class()” function. Aside: R can also store data as “complex” for complex numbers but generally this isn’t relevant for biological analyses. 3.3.1 Numeric The “numeric” class is the default class for storing any numeric data - integers, decimal numbers, numbers in scientific notation, etc… x = 1.141 class(x) ## [1] &quot;numeric&quot; y = 42 class(y) ## [1] &quot;numeric&quot; z = 6.02e23 class(z) ## [1] &quot;numeric&quot; Here we see that even though R has an “integer” class and 42 could be stored more efficiently as an integer the default is to store it as “numeric”. If we want 42 to be stored as an integer we must “coerce” it to that class: y = as.integer(42) class(y) ## [1] &quot;integer&quot; Coercion will force R to store data as a particular class, if our data is incompatible with that class it will still do it but the data will be converted to NAs: as.numeric(&quot;H&quot;) ## Warning: NAs introduced by coercion ## [1] NA Above we tried to coerce “character” data, identified by the double quotation marks, into numeric data which doesn’t make sense, so we triggered (“threw”) an warning message. Since this is only a warning R would continue with any subsequent commands in a script/function, whereas an “error” would cause R to halt. 3.3.2 Character/String The “character” class stores all kinds of text data. Programing convention calls data containing multiple letters a “string”, thus most R functions which act on character data will refer to the data as “strings” and will often have “str” or “string” in it’s name. Strings are identified by being flanked by double quotation marks, whereas variable/function names are not: x = 5 a = &quot;x&quot; # character &quot;x&quot; a ## [1] &quot;x&quot; b = x # variable x b ## [1] 5 In addition to standard alphanumeric characters, strings can also store various special characters. Special characters are identified using a backlash followed by a single character, the most relevant are the special character for tab : \\t and new line : \\n. To demonstrate the these special characters lets concatenate (cat) together two strings with these characters separating (sep) them: cat(&quot;Hello&quot;, &quot;World&quot;, sep= &quot; &quot;) ## Hello World cat(&quot;Hello&quot;, &quot;World&quot;, sep= &quot;\\t&quot;) ## Hello World cat(&quot;Hello&quot;, &quot;World&quot;, sep= &quot;\\n&quot;) ## Hello ## World Note that special characters work differently in different functions. For instance the paste function does the same thing as cat but does not recognize special characters. 
paste(&quot;Hello&quot;, &quot;World&quot;, sep= &quot; &quot;) ## [1] &quot;Hello World&quot; paste(&quot;Hello&quot;, &quot;World&quot;, sep= &quot;\\t&quot;) ## [1] &quot;Hello\\tWorld&quot; paste(&quot;Hello&quot;, &quot;World&quot;, sep= &quot;\\n&quot;) ## [1] &quot;Hello\\nWorld&quot; Single or double backslash is also used as an escape character to turn off special characters or allow quotation marks to be included in strings: cat(&quot;This \\&quot;string\\&quot; contains quotation marks.&quot;) ## This &quot;string&quot; contains quotation marks. Special characters are generally only used in pattern matching, and reading/writing data to files. For instance this is how you would read a tab-separated file into R. dat = read.delim(&quot;file.tsv&quot;, sep=&quot;\\t&quot;) Another special type of character data are colours. Colours can be specified in three main ways: by name from those available, by red, green, blue values using the rgb function, and by hue (colour), saturation (colour vs white) and value (colour/white vs black) using the hsv function. By default rgb and hsv expect three values in 0-1 with an optional fourth value for transparency. Alternatively, sets of predetermined colours with useful properties can be loaded from many different packages with RColorBrewer being one of the most popular. reds = c(&quot;red&quot;, rgb(1,0,0), hsv(0, 1, 1)) reds ## [1] &quot;red&quot; &quot;#FF0000&quot; &quot;#FF0000&quot; barplot(c(1,1,1), col=reds, names=c(&quot;by_name&quot;, &quot;by_rgb&quot;, &quot;by_hsv&quot;)) 3.3.3 Logical The logical class stores boolean truth values, i.e. TRUE and FALSE. It is used for storing the results of logical operations and conditional statements will be coerced to this class. Most other data-types can be coerced to boolean without triggering (or “throwing”) error messages, which may cause unexpected behaviour. x = TRUE class(x) ## [1] &quot;logical&quot; y = &quot;T&quot; as.logical(y) ## [1] TRUE z = 5 as.logical(z) ## [1] TRUE x = FALSE class(x) ## [1] &quot;logical&quot; y = &quot;F&quot; as.logical(y) ## [1] FALSE z = 0 as.logical(z) ## [1] FALSE Exercise 1 Experiment with other character and numeric values, which are coerced to TRUE or FALSE? which are coerced to neither? Do you ever throw a warning/error message? 3.3.4 Factors String/Character data is very memory inefficient to store, each letter generally requires the same amount of memory as any integer. Thus when storing a vector of strings with repeated elements it is more efficient assign each element to an integer and store the vector as integers and an additional string-to-integer association table. Thus, by default R will read in text columns of a data table as factors. str_vector = c(&quot;Apple&quot;, &quot;Apple&quot;, &quot;Banana&quot;, &quot;Banana&quot;, &quot;Banana&quot;, &quot;Carrot&quot;, &quot;Carrot&quot;, &quot;Apple&quot;, &quot;Banana&quot;) factored_vector = factor(str_vector) factored_vector ## [1] Apple Apple Banana Banana Banana Carrot Carrot Apple Banana ## Levels: Apple Banana Carrot as.numeric(factored_vector) ## [1] 1 1 2 2 2 3 3 1 2 The double nature of factors can cause some unintuitive behaviour. E.g. joining two factors together will convert them to the numeric form and the original strings will be lost. 
c(factored_vector, factored_vector) ## [1] 1 1 2 2 2 3 3 1 2 1 1 2 2 2 3 3 1 2 Likewise if due to formatting issues numeric data is mistakenly interpretted as strings, then you must convert the factor back to strings before coercing to numeric values: x = c(&quot;20&quot;, &quot;25&quot;, &quot;23&quot;, &quot;38&quot;, &quot;20&quot;, &quot;40&quot;, &quot;25&quot;, &quot;30&quot;) x = factor(x) as.numeric(x) ## [1] 1 3 2 5 1 6 3 4 as.numeric(as.character(x)) ## [1] 20 25 23 38 20 40 25 30 To make R read text as character data instead of factors set the environment option stringsAsFactors=FALSE. This must be done at the start of each R session. options(stringsAsFactors=FALSE) Exercise How would you use factors to create a vector of colours for an arbitrarily long vector of fruits like str_vector above? Answer 3.3.5 Checking class/type We recommend checking your data is of the correct class after reading from files: x = 1.4 is.numeric(x) ## [1] TRUE is.character(x) ## [1] FALSE is.logical(x) ## [1] FALSE is.factor(x) ## [1] FALSE 3.4 Basic data structures So far we have only looked at single values and vectors. Vectors are the simplest data structure in R. They are a 1-dimensional array of data all of the same type. If the input when creating a vector is of different types it will be coerced to the data-type that is most consistent with the data. x = c(&quot;Hello&quot;, 5, TRUE) x ## [1] &quot;Hello&quot; &quot;5&quot; &quot;TRUE&quot; class(x) ## [1] &quot;character&quot; Here we tried to put character, numeric and logical data into a single vector so all the values were coerced to character data. A matrix is the two dimensional version of a vector, it also requires all data to be of the same type. If we combine a character vector and a numeric vector into a matrix, all the data will be coerced to characters: x = c(&quot;A&quot;, &quot;B&quot;, &quot;C&quot;) y = c(1, 2, 3) class(x) ## [1] &quot;character&quot; class(y) ## [1] &quot;numeric&quot; m = cbind(x, y) m ## x y ## [1,] &quot;A&quot; &quot;1&quot; ## [2,] &quot;B&quot; &quot;2&quot; ## [3,] &quot;C&quot; &quot;3&quot; The quotation marks indicate that the numeric vector has been coerced to characters. Alternatively, to store data with columns of different data-types we can use a dataframe. z = data.frame(x, y) z ## x y ## 1 A 1 ## 2 B 2 ## 3 C 3 class(z[,1]) ## [1] &quot;character&quot; class(z[,2]) ## [1] &quot;numeric&quot; If you have set stringsAsFactors=FALSE as above you will find the first column remains characters, otherwise it will be automatically converted to a factor. options(stringsAsFactors=TRUE) z = data.frame(x, y) class(z[,1]) ## [1] &quot;factor&quot; Another difference between matrices and dataframes is the ability to select columns using the $ operator: m$x # throws an error z$x # ok The final basic data structure is the list. Lists allow data of different types and different lengths to be stored in a single object. Each element of a list can be any other R object : data of any type, any data structure, even other lists or functions. 
l = list(m, z) ll = list(sublist=l, a_matrix=m, numeric_value=42, this_string=&quot;Hello World&quot;, even_a_function=cbind) ll ## $sublist ## $sublist[[1]] ## x y ## [1,] &quot;A&quot; &quot;1&quot; ## [2,] &quot;B&quot; &quot;2&quot; ## [3,] &quot;C&quot; &quot;3&quot; ## ## $sublist[[2]] ## x y ## 1 A 1 ## 2 B 2 ## 3 C 3 ## ## ## $a_matrix ## x y ## [1,] &quot;A&quot; &quot;1&quot; ## [2,] &quot;B&quot; &quot;2&quot; ## [3,] &quot;C&quot; &quot;3&quot; ## ## $numeric_value ## [1] 42 ## ## $this_string ## [1] &quot;Hello World&quot; ## ## $even_a_function ## function (..., deparse.level = 1) ## .Internal(cbind(deparse.level, ...)) ## &lt;bytecode: 0x55dd34ac6108&gt; ## &lt;environment: namespace:base&gt; Lists are most commonly used when returning a large number of results from a function that do not fit into any of the previous data structures. 3.5 Accessing documentation and help files You can get more information about any R commands relevant to these datatypes using by typing ?function in an interactive session. 3.6 Data Types 3.6.1 What is Tidy Data? Tidy data is a concept largely defined by Hadley Wickham (Wickham 2014). Tidy data has the following three characteristics: Each variable has its own column. Each observation has its own row. Each value has its own cell. Here is an example of some tidy data: ## Students Subject Years Score ## 1 Mark Maths 1 5 ## 2 Jane Biology 2 6 ## 3 Mohammed Physics 3 4 ## 4 Tom Maths 2 7 ## 5 Celia Computing 3 9 Here is an example of some untidy data: ## Students Sport Category Counts ## 1 Matt Tennis Wins 0 ## 2 Matt Tennis Losses 1 ## 3 Ellie Rugby Wins 3 ## 4 Ellie Rugby Losses 2 ## 5 Tim Football Wins 1 ## 6 Tim Football Losses 4 ## 7 Louise Swimming Wins 2 ## 8 Louise Swimming Losses 2 ## 9 Kelly Running Wins 5 ## 10 Kelly Running Losses 1 Task 1: In what ways is the untidy data not tidy? How could we make the untidy data tidy? Tidy data is generally easier to work with than untidy data, especially if you are working with packages such as ggplot. Fortunately, packages are available to make untidy data tidy. Today we will explore a few of the functions available in the tidyr package which can be used to make untidy data tidy. If you are interested in finding out more about tidying data, we recommend reading “R for Data Science”, by Garrett Grolemund and Hadley Wickham. An electronic copy is available here: http://r4ds.had.co.nz/ The untidy data above is untidy because two variables (Wins and Losses) are stored in one column (Category). This is a common way in which data can be untidy. To tidy this data, we need to make Wins and Losses into columns, and store the values in Counts in these columns. Fortunately, there is a function from the tidyverse packages to perform this operation. The function is called spread, and it takes two arguments, key and value. You should pass the name of the column which contains multiple variables to key, and pass the name of the column which contains values from multiple variables to value. 
For example: library(tidyverse) sports&lt;-data.frame(Students=c(&quot;Matt&quot;, &quot;Matt&quot;, &quot;Ellie&quot;, &quot;Ellie&quot;, &quot;Tim&quot;, &quot;Tim&quot;, &quot;Louise&quot;, &quot;Louise&quot;, &quot;Kelly&quot;, &quot;Kelly&quot;), Sport=c(&quot;Tennis&quot;,&quot;Tennis&quot;, &quot;Rugby&quot;, &quot;Rugby&quot;,&quot;Football&quot;, &quot;Football&quot;,&quot;Swimming&quot;,&quot;Swimming&quot;, &quot;Running&quot;, &quot;Running&quot;), Category=c(&quot;Wins&quot;, &quot;Losses&quot;, &quot;Wins&quot;, &quot;Losses&quot;, &quot;Wins&quot;, &quot;Losses&quot;, &quot;Wins&quot;, &quot;Losses&quot;, &quot;Wins&quot;, &quot;Losses&quot;), Counts=c(0,1,3,2,1,4,2,2,5,1)) sports ## Students Sport Category Counts ## 1 Matt Tennis Wins 0 ## 2 Matt Tennis Losses 1 ## 3 Ellie Rugby Wins 3 ## 4 Ellie Rugby Losses 2 ## 5 Tim Football Wins 1 ## 6 Tim Football Losses 4 ## 7 Louise Swimming Wins 2 ## 8 Louise Swimming Losses 2 ## 9 Kelly Running Wins 5 ## 10 Kelly Running Losses 1 spread(sports, key=Category, value=Counts) ## Students Sport Losses Wins ## 1 Ellie Rugby 2 3 ## 2 Kelly Running 1 5 ## 3 Louise Swimming 2 2 ## 4 Matt Tennis 1 0 ## 5 Tim Football 4 1 Task 2: The dataframe foods defined below is untidy. Work out why and use spread() to tidy it foods&lt;-data.frame(student=c(&quot;Antoinette&quot;,&quot;Antoinette&quot;,&quot;Taylor&quot;, &quot;Taylor&quot;, &quot;Alexa&quot;, &quot;Alexa&quot;), Category=c(&quot;Dinner&quot;, &quot;Dessert&quot;, &quot;Dinner&quot;, &quot;Dessert&quot;, &quot;Dinner&quot;,&quot;Dessert&quot;), Frequency=c(3,1,4,5,2,1)) The other common way in which data can be untidy is if the columns are values instead of variables. For example, the dataframe below shows the percentages some students got in tests they did in May and June. The data is untidy because the columns May and June are values, not variables. percentages&lt;-data.frame(student=c(&quot;Alejandro&quot;, &quot;Pietro&quot;, &quot;Jane&quot;), &quot;May&quot;=c(90,12,45), &quot;June&quot;=c(80,30,100)) Fortunately, there is a function in the tidyverse packages to deal with this problem too. gather() takes the names of the columns which are values, the key and the value as arguments. This time, the key is the name of the variable with values as column names, and the value is the name of the variable with values spread over multiple columns. Ie: gather(percentages, &quot;May&quot;, &quot;June&quot;, key=&quot;Month&quot;, value = &quot;Percentage&quot;) ## student Month Percentage ## 1 Alejandro May 90 ## 2 Pietro May 12 ## 3 Jane May 45 ## 4 Alejandro June 80 ## 5 Pietro June 30 ## 6 Jane June 100 These examples don’t have much to do with single-cell RNA-seq analysis, but are designed to help illustrate the features of tidy and untidy data. You will find it much easier to analyse your single-cell RNA-seq data if your data is stored in a tidy format. Fortunately, the data structures we commonly use to facilitate single-cell RNA-seq analysis usually encourage store your data in a tidy manner. 3.6.2 What is Rich Data? If you google ‘rich data’, you will find lots of different definitions for this term. In this course, we will use ‘rich data’ to mean data which is generated by combining information from multiple sources. For example, you could make rich data by creating an object in R which contains a matrix of gene expression values across the cells in your single-cell RNA-seq experiment, but also information about how the experiment was performed. 
Objects of the SingleCellExperiment class, which we will discuss below, are an example of rich data. Typically, Bioconductor packages make use of rich data objects that have many advantages for package developers and users alike. 3.7 What is Bioconductor? From Wikipedia: Bioconductor is a free, open source and open development software project for the analysis and comprehension of genomic data generated by wet lab experiments in molecular biology. Bioconductor is based primarily on the statistical R programming language, but does contain contributions in other programming languages. It has two releases each year that follow the semiannual releases of R. At any one time there is a release version,which corresponds to the released version of R, and a development version, which corresponds to the development version of R. Most users will find the release version appropriate for their needs. We strongly recommend all new comers and even experienced high-throughput data analysts to use well developed and maintained Bioconductor methods and classes. 3.8 SingleCellExperiment class SingleCellExperiment (SCE) is a S4 class for storing data from single-cell experiments. This includes specialized methods to store and retrieve spike-in information, dimensionality reduction coordinates and size factors for each cell, along with the usual metadata for genes and libraries. In practice, an object of this class can be created using its constructor: library(SingleCellExperiment) counts &lt;- matrix(rpois(100, lambda = 10), ncol=10, nrow=10) rownames(counts) &lt;- paste(&quot;gene&quot;, 1:10, sep = &quot;&quot;) colnames(counts) &lt;- paste(&quot;cell&quot;, 1:10, sep = &quot;&quot;) sce &lt;- SingleCellExperiment( assays = list(counts = counts), rowData = data.frame(gene_names = paste(&quot;gene_name&quot;, 1:10, sep = &quot;&quot;)), colData = data.frame(cell_names = paste(&quot;cell_name&quot;, 1:10, sep = &quot;&quot;)) ) sce ## class: SingleCellExperiment ## dim: 10 10 ## metadata(0): ## assays(1): counts ## rownames(10): gene1 gene2 ... gene9 gene10 ## rowData names(1): gene_names ## colnames(10): cell1 cell2 ... cell9 cell10 ## colData names(1): cell_names ## reducedDimNames(0): ## spikeNames(0): In the SingleCellExperiment, users can assign arbitrary names to entries of assays. To assist interoperability between packages, some suggestions for what the names should be for particular types of data are provided by the authors: counts: Raw count data, e.g., number of reads or transcripts for a particular gene. normcounts: Normalized values on the same scale as the original counts. For example, counts divided by cell-specific size factors that are centred at unity. logcounts: Log-transformed counts or count-like values. In most cases, this will be defined as log-transformed normcounts, e.g., using log base 2 and a pseudo-count of 1. cpm: Counts-per-million. This is the read count for each gene in each cell, divided by the library size of each cell in millions. tpm: Transcripts-per-million. This is the number of transcripts for each gene in each cell, divided by the total number of transcripts in that cell (in millions). Each of these suggested names has an appropriate getter/setter method for convenient manipulation of the SingleCellExperiment. 
For example, we can take the (very specifically named) counts slot, normalise it and assign it to normcounts instead: normcounts(sce) &lt;- log2(counts(sce) + 1) sce ## class: SingleCellExperiment ## dim: 10 10 ## metadata(0): ## assays(2): counts normcounts ## rownames(10): gene1 gene2 ... gene9 gene10 ## rowData names(1): gene_names ## colnames(10): cell1 cell2 ... cell9 cell10 ## colData names(1): cell_names ## reducedDimNames(0): ## spikeNames(0): dim(normcounts(sce)) ## [1] 10 10 head(normcounts(sce)) ## cell1 cell2 cell3 cell4 cell5 cell6 cell7 ## gene1 3.169925 3.169925 2.000000 2.584963 2.584963 3.321928 3.584963 ## gene2 3.459432 1.584963 3.584963 3.807355 3.700440 3.700440 3.000000 ## gene3 3.000000 3.169925 3.807355 3.169925 3.321928 3.321928 3.321928 ## gene4 3.584963 3.459432 3.000000 3.807355 3.700440 3.700440 3.700440 ## gene5 3.906891 3.000000 3.169925 3.321928 3.584963 3.459432 3.807355 ## gene6 3.700440 3.700440 3.584963 4.000000 3.169925 3.000000 3.459432 ## cell8 cell9 cell10 ## gene1 3.321928 3.807355 2.807355 ## gene2 3.807355 3.700440 4.000000 ## gene3 2.584963 4.000000 3.700440 ## gene4 3.169925 3.584963 3.700440 ## gene5 3.807355 2.584963 3.584963 ## gene6 3.321928 3.459432 4.000000 3.9 scater package scater is a R package for single-cell RNA-seq analysis (McCarthy et al. 2017). The package contains several useful methods for quality control, visualisation and pre-processing of data prior to further downstream analysis. scater features the following functionality: Automated computation of QC metrics Transcript quantification from read data with pseudo-alignment Data format standardisation Rich visualizations for exploratory analysis Seamless integration into the Bioconductor universe Simple normalisation methods We highly recommend to use scater for all single-cell RNA-seq analyses and scater is the basis of the first part of the course. As illustrated in the figure below, scater will help you with quality control, filtering and normalization of your expression matrix following mapping and alignment. Keep in mind that this figure represents the original version of scater where an SCESet class was used. In the newest version this figure is still correct, except that SCESet can be substituted with the SingleCellExperiment class. 3.10 Introduction to ggplot2 3.10.1 What is ggplot2? ggplot2 is an R package designed by Hadley Wickham which facilitates data plotting. In this lab, we will touch briefly on some of the features of the package. If you would like to learn more about how to use ggplot2, we would recommend reading “ggplot2 Elegant graphics for data analysis”, by Hadley Wickham. 3.10.2 Principles of ggplot2 Your data must be a dataframe if you want to plot it using ggplot2. Use the aes mapping function to specify how variables in the dataframe map to features on your plot Use geoms to specify how your data should be represented on your graph eg. as a scatterplot, a barplot, a boxplot etc. 3.10.3 Using the aes mapping function The aes function specifies how variables in your dataframe map to features on your plot. 
To understand how this works, let’s look at an example: library(ggplot2) library(tidyverse) set.seed(1) counts &lt;- as.data.frame(matrix(rpois(100, lambda = 10), ncol=10, nrow=10)) Gene_ids &lt;- paste(&quot;gene&quot;, 1:10, sep = &quot;&quot;) colnames(counts) &lt;- paste(&quot;cell&quot;, 1:10, sep = &quot;&quot;) counts&lt;-data.frame(Gene_ids, counts) counts ## Gene_ids cell1 cell2 cell3 cell4 cell5 cell6 cell7 cell8 cell9 cell10 ## 1 gene1 8 8 3 5 5 9 11 9 13 6 ## 2 gene2 10 2 11 13 12 12 7 13 12 15 ## 3 gene3 7 8 13 8 9 9 9 5 15 12 ## 4 gene4 11 10 7 13 12 12 12 8 11 12 ## 5 gene5 14 7 8 9 11 10 13 13 5 11 ## 6 gene6 12 12 11 15 8 7 10 9 10 15 ## 7 gene7 11 11 14 11 11 5 9 13 13 7 ## 8 gene8 9 12 9 8 6 14 7 12 12 10 ## 9 gene9 14 12 11 7 10 10 8 14 7 10 ## 10 gene10 11 10 9 7 11 16 8 7 7 4 ggplot(data = counts, mapping = aes(x = cell1, y = cell2)) Let’s take a closer look at the final command, ggplot(data = counts, mapping = aes(x = cell1, y = cell2)). ggplot() initialises a ggplot object and takes the arguments data and mapping. We pass our dataframe of counts to data and use the aes() function to specify that we would like to use the variable cell1 as our x variable and the variable cell2 as our y variable. Task 1: Modify the command above to initialise a ggplot object where cell10 is the x variable and cell8 is the y variable. Clearly, the plots we have just created are not very informative because no data is displayed on them. To display data, we will need to use geoms. 3.10.4 Geoms We can use geoms to specify how we would like data to be displayed on our graphs. For example, our choice of geom could specify that we would like our data to be displayed as a scatterplot, a barplot or a boxplot. Let’s see how our graph would look as a scatterplot. ggplot(data = counts, mapping = aes(x = cell1, y = cell2)) + geom_point() Now we can see that there doesn’t seem to be any correlation between gene expression in cell1 and cell2. Given we generated counts randomly, this isn’t too surprising. Task 2: Modify the command above to create a line plot. Hint: execute ?ggplot and scroll down the help page. At the bottom is a link to the ggplot package index. Scroll through the index until you find the geom options. 3.10.5 Plotting data from more than 2 cells So far we’ve been considering the gene counts from 2 of the cells in our dataframe. But there are actually 10 cells in our dataframe and it would be nice to compare all of them. What if we wanted to plot data from all 10 cells at the same time? At the moment we can’t do this because we are treating each individual cell as a variable and assigning that variable to either the x or the y axis. We could create a 10 dimensional graph to plot data from all 10 cells on, but this is a) not possible to do with ggplot and b) not very easy to interpret. What we could do instead is to tidy our data so that we had one variable representing cell ID and another variable representing gene counts, and plot those against each other. In code, this would look like: counts&lt;-gather(counts, colnames(counts)[2:11], key = &#39;Cell_ID&#39;, value=&#39;Counts&#39;) head(counts) ## Gene_ids Cell_ID Counts ## 1 gene1 cell1 8 ## 2 gene2 cell1 10 ## 3 gene3 cell1 7 ## 4 gene4 cell1 11 ## 5 gene5 cell1 14 ## 6 gene6 cell1 12 Essentially, the problem before was that our data was not tidy because one variable (Cell_ID) was spread over multiple columns. Now that we’ve fixed this problem, it is much easier for us to plot data from all 10 cells on one graph. 
ggplot(counts,aes(x=Cell_ID, y=Counts)) + geom_boxplot() Task 3: Use the updated counts dataframe to plot a barplot with Cell_ID as the x variable and Counts as the y variable. Hint: you may find it helpful to read ?geom_bar. Task 4: Use the updated counts dataframe to plot a scatterplot with Gene_ids as the x variable and Counts as the y variable. 3.10.6 Plotting heatmaps A common method for visualising gene expression data is with a heatmap. Here we will use the R package pheatmap to perform this analysis with some gene expression data we will name test. library(pheatmap) set.seed(2) test = matrix(rnorm(200), 20, 10) test[1:10, seq(1, 10, 2)] = test[1:10, seq(1, 10, 2)] + 3 test[11:20, seq(2, 10, 2)] = test[11:20, seq(2, 10, 2)] + 2 test[15:20, seq(2, 10, 2)] = test[15:20, seq(2, 10, 2)] + 4 colnames(test) = paste(&quot;Cell&quot;, 1:10, sep = &quot;&quot;) rownames(test) = paste(&quot;Gene&quot;, 1:20, sep = &quot;&quot;) pheatmap(test) Let’s take a moment to work out what this graphic is showing us. Each row represents a gene and each column represents a cell. How highly expressed each gene is in each cell is represented by the colour of the corresponding box. For example, we can tell from this plot that gene18 is highly expressed in cell10 but lowly expressed in cell1. This plot also gives us information on the results of a clustering algorithm. In general, clustering algorithms aim to split datapoints (eg.cells) into groups whose members are more alike one another than they are alike the rest of the datapoints. The trees drawn on the top and left hand sides of the graph are the results of clustering algorithms and enable us to see, for example, that cells 4,8,2,6 and 10 are more alike one another than they are alike cells 7,3,5,1 and 9. The tree on the left hand side of the graph represents the results of a clustering algorithm applied to the genes in our dataset. If we look closely at the trees, we can see that eventually they have the same number of branches as there are cells and genes. In other words, the total number of cell clusters is the same as the total number of cells, and the total number of gene clusters is the same as the total number of genes. Clearly, this is not very informative, and will become impractical when we are looking at more than 10 cells and 20 genes. Fortunately, we can set the number of clusters we see on the plot. Let’s try setting the number of gene clusters to 2: pheatmap(test, kmeans_k = 2) Now we can see that the genes fall into two clusters - a cluster of 8 genes which are upregulated in cells 2, 10, 6, 4 and 8 relative to the other cells and a cluster of 12 genes which are downregulated in cells 2, 10, 6, 4 and 8 relative to the other cells. Task 5: Try setting the number of clusters to 3. Which number of clusters do you think is more informative? 3.10.7 Principal Component Analysis Principal component analysis (PCA) is a statistical procedure that uses a transformation to convert a set of observations into a set of values of linearly uncorrelated variables called principal components. The transformation is carried out so that the first principle component accounts for as much of the variability in the data as possible, and each following principle component accounts for the greatest amount of variance possible under the contraint that it must be orthogonal to the previous components. PCA plots are a good way to get an overview of your data, and can sometimes help identify confounders which explain a high amount of the variability in your data. 
We will investigate how we can use PCA plots in single-cell RNA-seq analysis in more depth in a future lab, here the aim is to give you an overview of what PCA plots are and how they are generated. Let’s make a PCA plot for our test data. We can use the ggfortify package to let ggplot know how to interpret principle components. library(ggfortify) Principal_Components&lt;-prcomp(test) autoplot(Principal_Components, label=TRUE) Task 6: Compare your clusters to the pheatmap clusters. Are they related? (Hint: have a look at the gene tree for the first pheatmap we plotted) Task 7: Produce a heatmap and PCA plot for counts (below): set.seed(1) counts &lt;- as.data.frame(matrix(rpois(100, lambda = 10), ncol=10, nrow=10)) rownames(counts) &lt;- paste(&quot;gene&quot;, 1:10, sep = &quot;&quot;) colnames(counts) &lt;- paste(&quot;cell&quot;, 1:10, sep = &quot;&quot;) References "],
+["introduction-to-rbioconductor.html", "3 Introduction to R/Bioconductor 3.1 Installing packages 3.2 Installation instructions: 3.3 Data-types/classes 3.4 Basic data structures 3.5 Accessing documentation and help files 3.6 Data Types 3.7 What is Bioconductor? 3.8 SingleCellExperiment class 3.9 scater package 3.10 Introduction to ggplot2", " 3 Introduction to R/Bioconductor 3.1 Installing packages 3.1.1 CRAN The Comprehensive R Archive Network CRAN is the biggest archive of R packages. There are few requirements for uploading packages besides building and installing succesfully, hence documentation and support is often minimal and figuring how to use these packages can be a challenge it itself. CRAN is the default repository R will search to find packages to install: install.packages(&quot;devtools&quot;) require(&quot;devtools&quot;) 3.1.2 Github Github isn’t specific to R, any code of any type in any state can be uploaded. There is no guarantee a package uploaded to github will even install, nevermind do what it claims to do. R packages can be downloaded and installed directly from github using the “devtools” package installed above. devtools::install_github(&quot;tallulandrews/M3Drop&quot;) Github is also a version control system which stores multiple versions of any package. By default the most recent “master” version of the package is installed. If you want an older version or the development branch this can be specified using the “ref” parameter: # different branch devtools::install_github(&quot;tallulandrews/M3D&quot;, ref=&quot;nbumi&quot;) # previous commit devtools::install_github(&quot;tallulandrews/M3Drop&quot;, ref=&quot;434d2da28254acc8de4940c1dc3907ac72973135&quot;) Note: make sure you re-install the M3Drop master branch for later in the course. 3.1.3 Bioconductor Bioconductor is a repository of R-packages specifically for biological analyses. It has the strictest requirements for submission, including installation on every platform and full documentation with a tutorial (called a vignette) explaining how the package should be used. Bioconductor also encourages utilization of standard data structures/classes and coding style/naming conventions, so that, in theory, packages and analyses can be combined into large pipelines or workflows. source(&quot;https://bioconductor.org/biocLite.R&quot;) biocLite(&quot;edgeR&quot;) Note: in some situations it is necessary to substitute “http://” for “https://” in the above depending on the security features of your internet connection/network. Bioconductor also requires creators to support their packages and has a regular 6-month release schedule. Make sure you are using the most recent release of bioconductor before trying to install packages for the course. source(&quot;https://bioconductor.org/biocLite.R&quot;) biocLite(&quot;BiocUpgrade&quot;) 3.1.4 Source The final way to install packages is directly from source. In this case you have to download a fully built source code file, usually packagename.tar.gz, or clone the github repository and rebuild the package yourself. Generally this will only be done if you want to edit a package yourself, or if for some reason the former methods have failed. install.packages(&quot;M3Drop_3.05.00.tar.gz&quot;, type=&quot;source&quot;) 3.2 Installation instructions: All the packages necessary for this course are available here. 
3.2 Installation instructions: All the packages necessary for this course are available here. Starting from “RUN Rscript -e &quot;install.packages('devtools')&quot;”, run each of the commands (minus “RUN”) on the command line, or start an R session and run each of the commands within the quotation marks. Note that the ordering of the installation is important in some cases, so make sure you run them in order from top to bottom. 3.3 Data-types/classes R is a high level language so the underlying data-type is generally not important. The exception is if you are accessing R data directly using another language such as C, but that is beyond the scope of this course. Instead we will consider the basic data classes: numeric, integer, logical, and character, and the higher level data class called “factor”. You can check what class your data is using the “class()” function. Aside: R can also store data as “complex” for complex numbers, but generally this isn’t relevant for biological analyses. 3.3.1 Numeric The “numeric” class is the default class for storing any numeric data - integers, decimal numbers, numbers in scientific notation, etc… x = 1.141 class(x) ## [1] &quot;numeric&quot; y = 42 class(y) ## [1] &quot;numeric&quot; z = 6.02e23 class(z) ## [1] &quot;numeric&quot; Here we see that even though R has an “integer” class and 42 could be stored more efficiently as an integer, the default is to store it as “numeric”. If we want 42 to be stored as an integer we must “coerce” it to that class: y = as.integer(42) class(y) ## [1] &quot;integer&quot; Coercion will force R to store data as a particular class; if our data is incompatible with that class it will still be coerced, but the data will be converted to NAs: as.numeric(&quot;H&quot;) ## Warning: NAs introduced by coercion ## [1] NA Above we tried to coerce “character” data, identified by the double quotation marks, into numeric data, which doesn’t make sense, so we triggered (“threw”) a warning message. Since this is only a warning, R would continue with any subsequent commands in a script/function, whereas an “error” would cause R to halt. 3.3.2 Character/String The “character” class stores all kinds of text data. Programming convention calls data containing multiple letters a “string”, thus most R functions which act on character data will refer to the data as “strings” and will often have “str” or “string” in their name. Strings are identified by being flanked by double quotation marks, whereas variable/function names are not: x = 5 a = &quot;x&quot; # character &quot;x&quot; a ## [1] &quot;x&quot; b = x # variable x b ## [1] 5 In addition to standard alphanumeric characters, strings can also store various special characters. Special characters are identified using a backslash followed by a single character, the most relevant being tab (\\t) and newline (\\n). To demonstrate these special characters, let’s concatenate (cat) two strings with these characters separating (sep) them: cat(&quot;Hello&quot;, &quot;World&quot;, sep= &quot; &quot;) ## Hello World cat(&quot;Hello&quot;, &quot;World&quot;, sep= &quot;\\t&quot;) ## Hello World cat(&quot;Hello&quot;, &quot;World&quot;, sep= &quot;\\n&quot;) ## Hello ## World 
Note that special characters work differently in different functions. For instance the paste function does the same thing as cat but does not recognize special characters. paste(&quot;Hello&quot;, &quot;World&quot;, sep= &quot; &quot;) ## [1] &quot;Hello World&quot; paste(&quot;Hello&quot;, &quot;World&quot;, sep= &quot;\\t&quot;) ## [1] &quot;Hello\\tWorld&quot; paste(&quot;Hello&quot;, &quot;World&quot;, sep= &quot;\\n&quot;) ## [1] &quot;Hello\\nWorld&quot; A single or double backslash is also used as an escape character to turn off special characters or to allow quotation marks to be included in strings: cat(&quot;This \\&quot;string\\&quot; contains quotation marks.&quot;) ## This &quot;string&quot; contains quotation marks. Special characters are generally only used in pattern matching and when reading/writing data to files. For instance, this is how you would read a tab-separated file into R: dat = read.delim(&quot;file.tsv&quot;, sep=&quot;\\t&quot;) Another special type of character data is colours. Colours can be specified in three main ways: by name from those available, by red, green, blue values using the rgb function, and by hue (colour), saturation (colour vs white) and value (colour/white vs black) using the hsv function. By default rgb and hsv expect three values in 0-1, with an optional fourth value for transparency. Alternatively, sets of predetermined colours with useful properties can be loaded from many different packages, with RColorBrewer being one of the most popular. reds = c(&quot;red&quot;, rgb(1,0,0), hsv(0, 1, 1)) reds ## [1] &quot;red&quot; &quot;#FF0000&quot; &quot;#FF0000&quot; barplot(c(1,1,1), col=reds, names=c(&quot;by_name&quot;, &quot;by_rgb&quot;, &quot;by_hsv&quot;)) 3.3.3 Logical The logical class stores boolean truth values, i.e. TRUE and FALSE. It is used for storing the results of logical operations, and conditional statements will be coerced to this class. Most other data-types can be coerced to boolean without triggering (or “throwing”) error messages, which may cause unexpected behaviour. x = TRUE class(x) ## [1] &quot;logical&quot; y = &quot;T&quot; as.logical(y) ## [1] TRUE z = 5 as.logical(z) ## [1] TRUE x = FALSE class(x) ## [1] &quot;logical&quot; y = &quot;F&quot; as.logical(y) ## [1] FALSE z = 0 as.logical(z) ## [1] FALSE Exercise 1 Experiment with other character and numeric values: which are coerced to TRUE or FALSE? Which are coerced to neither? Do you ever throw a warning/error message? 3.3.4 Factors String/character data is very memory inefficient to store; each letter generally requires the same amount of memory as any integer. Thus, when storing a vector of strings with repeated elements, it is more efficient to assign each element to an integer and store the vector as integers plus an additional string-to-integer association table. Thus, by default R will read in text columns of a data table as factors. str_vector = c(&quot;Apple&quot;, &quot;Apple&quot;, &quot;Banana&quot;, &quot;Banana&quot;, &quot;Banana&quot;, &quot;Carrot&quot;, &quot;Carrot&quot;, &quot;Apple&quot;, &quot;Banana&quot;) factored_vector = factor(str_vector) factored_vector ## [1] Apple Apple Banana Banana Banana Carrot Carrot Apple Banana ## Levels: Apple Banana Carrot as.numeric(factored_vector) ## [1] 1 1 2 2 2 3 3 1 2 One practical use of this integer form is sketched below. 
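Because the integer codes correspond to the factor levels, they can index any other vector arranged in level order, such as a vector of colours (the colour choices here are arbitrary and purely illustrative):

# Pick one colour per level (levels are Apple, Banana, Carrot) and index
# the colour vector by the factor's integer codes
fruit_colours &lt;- c(&quot;green&quot;, &quot;yellow&quot;, &quot;orange&quot;)
fruit_colours[as.numeric(factored_vector)]
## [1] &quot;green&quot;  &quot;green&quot;  &quot;yellow&quot; &quot;yellow&quot; &quot;yellow&quot; &quot;orange&quot; &quot;orange&quot; &quot;green&quot;
## [9] &quot;yellow&quot;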
The double nature of factors can cause some unintuitive behaviour. E.g. joining two factors together will convert them to the numeric form and the original strings will be lost. c(factored_vector, factored_vector) ## [1] 1 1 2 2 2 3 3 1 2 1 1 2 2 2 3 3 1 2 Likewise, if numeric data is mistakenly interpreted as strings due to formatting issues, then you must convert the factor back to strings before coercing to numeric values: x = c(&quot;20&quot;, &quot;25&quot;, &quot;23&quot;, &quot;38&quot;, &quot;20&quot;, &quot;40&quot;, &quot;25&quot;, &quot;30&quot;) x = factor(x) as.numeric(x) ## [1] 1 3 2 5 1 6 3 4 as.numeric(as.character(x)) ## [1] 20 25 23 38 20 40 25 30 To make R read text as character data instead of factors, set the environment option stringsAsFactors=FALSE. This must be done at the start of each R session. options(stringsAsFactors=FALSE) Exercise How would you use factors to create a vector of colours for an arbitrarily long vector of fruits like str_vector above? Answer 3.3.5 Checking class/type We recommend checking that your data is of the correct class after reading from files: x = 1.4 is.numeric(x) ## [1] TRUE is.character(x) ## [1] FALSE is.logical(x) ## [1] FALSE is.factor(x) ## [1] FALSE 3.4 Basic data structures So far we have only looked at single values and vectors. Vectors are the simplest data structure in R. They are a 1-dimensional array of data, all of the same type. If the input when creating a vector is of different types, it will be coerced to the data-type that is most consistent with the data. x = c(&quot;Hello&quot;, 5, TRUE) x ## [1] &quot;Hello&quot; &quot;5&quot; &quot;TRUE&quot; class(x) ## [1] &quot;character&quot; Here we tried to put character, numeric and logical data into a single vector, so all the values were coerced to character data. A matrix is the two-dimensional version of a vector; it also requires all data to be of the same type. If we combine a character vector and a numeric vector into a matrix, all the data will be coerced to characters: x = c(&quot;A&quot;, &quot;B&quot;, &quot;C&quot;) y = c(1, 2, 3) class(x) ## [1] &quot;character&quot; class(y) ## [1] &quot;numeric&quot; m = cbind(x, y) m ## x y ## [1,] &quot;A&quot; &quot;1&quot; ## [2,] &quot;B&quot; &quot;2&quot; ## [3,] &quot;C&quot; &quot;3&quot; The quotation marks indicate that the numeric vector has been coerced to characters. Alternatively, to store data with columns of different data-types we can use a dataframe. z = data.frame(x, y) z ## x y ## 1 A 1 ## 2 B 2 ## 3 C 3 class(z[,1]) ## [1] &quot;character&quot; class(z[,2]) ## [1] &quot;numeric&quot; If you have set stringsAsFactors=FALSE as above, you will find that the first column remains characters; otherwise it will be automatically converted to a factor. options(stringsAsFactors=TRUE) z = data.frame(x, y) class(z[,1]) ## [1] &quot;factor&quot; Another difference between matrices and dataframes is the ability to select columns using the $ operator: m$x # throws an error z$x # ok Working alternatives for both structures are sketched below. 
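For instance (a small sketch; m and z are the objects defined above, and whether z columns print as factors or characters depends on the stringsAsFactors setting in effect when z was created):

# Matrix columns are selected with [row, column] indexing, by name or position
m[, &quot;x&quot;]
## [1] &quot;A&quot; &quot;B&quot; &quot;C&quot;
# Dataframe columns can be selected the same way, or with $ / [[ ]]
z[[&quot;x&quot;]]   # equivalent to z$x
## [1] A B C
## Levels: A B C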
The final basic data structure is the list. Lists allow data of different types and different lengths to be stored in a single object. Each element of a list can be any other R object: data of any type, any data structure, even other lists or functions. l = list(m, z) ll = list(sublist=l, a_matrix=m, numeric_value=42, this_string=&quot;Hello World&quot;, even_a_function=cbind) ll ## $sublist ## $sublist[[1]] ## x y ## [1,] &quot;A&quot; &quot;1&quot; ## [2,] &quot;B&quot; &quot;2&quot; ## [3,] &quot;C&quot; &quot;3&quot; ## ## $sublist[[2]] ## x y ## 1 A 1 ## 2 B 2 ## 3 C 3 ## ## ## $a_matrix ## x y ## [1,] &quot;A&quot; &quot;1&quot; ## [2,] &quot;B&quot; &quot;2&quot; ## [3,] &quot;C&quot; &quot;3&quot; ## ## $numeric_value ## [1] 42 ## ## $this_string ## [1] &quot;Hello World&quot; ## ## $even_a_function ## function (..., deparse.level = 1) ## .Internal(cbind(deparse.level, ...)) ## &lt;bytecode: 0x55ef7883b0f8&gt; ## &lt;environment: namespace:base&gt; Lists are most commonly used when returning a large number of results from a function that do not fit into any of the previous data structures. 3.5 Accessing documentation and help files You can get more information about any R commands relevant to these data types by typing ?function in an interactive session. 3.6 Data Types 3.6.1 What is Tidy Data? Tidy data is a concept largely defined by Hadley Wickham (Wickham 2014). Tidy data has the following three characteristics: Each variable has its own column. Each observation has its own row. Each value has its own cell. Here is an example of some tidy data: ## Students Subject Years Score ## 1 Mark Maths 1 5 ## 2 Jane Biology 2 6 ## 3 Mohammed Physics 3 4 ## 4 Tom Maths 2 7 ## 5 Celia Computing 3 9 Here is an example of some untidy data: ## Students Sport Category Counts ## 1 Matt Tennis Wins 0 ## 2 Matt Tennis Losses 1 ## 3 Ellie Rugby Wins 3 ## 4 Ellie Rugby Losses 2 ## 5 Tim Football Wins 1 ## 6 Tim Football Losses 4 ## 7 Louise Swimming Wins 2 ## 8 Louise Swimming Losses 2 ## 9 Kelly Running Wins 5 ## 10 Kelly Running Losses 1 Task 1: In what ways is the untidy data not tidy? How could we make the untidy data tidy? Tidy data is generally easier to work with than untidy data, especially if you are working with packages such as ggplot2. Fortunately, packages are available to make untidy data tidy. Today we will explore a few of the functions available in the tidyr package which can be used to make untidy data tidy. If you are interested in finding out more about tidying data, we recommend reading “R for Data Science”, by Garrett Grolemund and Hadley Wickham. An electronic copy is available here: http://r4ds.had.co.nz/ The untidy data above is untidy because two variables (Wins and Losses) are stored in one column (Category). This is a common way in which data can be untidy. To tidy this data, we need to make Wins and Losses into columns, and store the values from Counts in these columns. Fortunately, there is a function from the tidyverse packages to perform this operation. The function is called spread, and it takes two arguments, key and value. You should pass the name of the column which contains multiple variables to key, and pass the name of the column which contains values from multiple variables to value. 
For example: library(tidyverse) sports&lt;-data.frame(Students=c(&quot;Matt&quot;, &quot;Matt&quot;, &quot;Ellie&quot;, &quot;Ellie&quot;, &quot;Tim&quot;, &quot;Tim&quot;, &quot;Louise&quot;, &quot;Louise&quot;, &quot;Kelly&quot;, &quot;Kelly&quot;), Sport=c(&quot;Tennis&quot;,&quot;Tennis&quot;, &quot;Rugby&quot;, &quot;Rugby&quot;,&quot;Football&quot;, &quot;Football&quot;,&quot;Swimming&quot;,&quot;Swimming&quot;, &quot;Running&quot;, &quot;Running&quot;), Category=c(&quot;Wins&quot;, &quot;Losses&quot;, &quot;Wins&quot;, &quot;Losses&quot;, &quot;Wins&quot;, &quot;Losses&quot;, &quot;Wins&quot;, &quot;Losses&quot;, &quot;Wins&quot;, &quot;Losses&quot;), Counts=c(0,1,3,2,1,4,2,2,5,1)) sports ## Students Sport Category Counts ## 1 Matt Tennis Wins 0 ## 2 Matt Tennis Losses 1 ## 3 Ellie Rugby Wins 3 ## 4 Ellie Rugby Losses 2 ## 5 Tim Football Wins 1 ## 6 Tim Football Losses 4 ## 7 Louise Swimming Wins 2 ## 8 Louise Swimming Losses 2 ## 9 Kelly Running Wins 5 ## 10 Kelly Running Losses 1 spread(sports, key=Category, value=Counts) ## Students Sport Losses Wins ## 1 Ellie Rugby 2 3 ## 2 Kelly Running 1 5 ## 3 Louise Swimming 2 2 ## 4 Matt Tennis 1 0 ## 5 Tim Football 4 1 Task 2: The dataframe foods defined below is untidy. Work out why and use spread() to tidy it (one possible solution is sketched at the end of this subsection). foods&lt;-data.frame(student=c(&quot;Antoinette&quot;,&quot;Antoinette&quot;,&quot;Taylor&quot;, &quot;Taylor&quot;, &quot;Alexa&quot;, &quot;Alexa&quot;), Category=c(&quot;Dinner&quot;, &quot;Dessert&quot;, &quot;Dinner&quot;, &quot;Dessert&quot;, &quot;Dinner&quot;,&quot;Dessert&quot;), Frequency=c(3,1,4,5,2,1)) The other common way in which data can be untidy is if the columns are values instead of variables. For example, the dataframe below shows the percentages some students got in tests they did in May and June. The data is untidy because the columns May and June are values, not variables. percentages&lt;-data.frame(student=c(&quot;Alejandro&quot;, &quot;Pietro&quot;, &quot;Jane&quot;), &quot;May&quot;=c(90,12,45), &quot;June&quot;=c(80,30,100)) Fortunately, there is a function in the tidyverse packages to deal with this problem too. gather() takes the names of the columns which are values, the key and the value as arguments. This time, the key is the name of the variable with values as column names, and the value is the name of the variable with values spread over multiple columns. I.e.: gather(percentages, &quot;May&quot;, &quot;June&quot;, key=&quot;Month&quot;, value = &quot;Percentage&quot;) ## student Month Percentage ## 1 Alejandro May 90 ## 2 Pietro May 12 ## 3 Jane May 45 ## 4 Alejandro June 80 ## 5 Pietro June 30 ## 6 Jane June 100 These examples don’t have much to do with single-cell RNA-seq analysis, but are designed to help illustrate the features of tidy and untidy data. You will find it much easier to analyse your single-cell RNA-seq data if your data is stored in a tidy format. Fortunately, the data structures we commonly use to facilitate single-cell RNA-seq analysis usually encourage storing your data in a tidy manner. 
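For reference, here is one possible solution to Task 2 (a sketch: as with sports, the issue is that two variables, Dinner and Dessert, are stored in the single column Category):

# Spread Category into separate Dinner and Dessert columns
spread(foods, key=Category, value=Frequency)
##      student Dessert Dinner
## 1      Alexa       1      2
## 2 Antoinette       1      3
## 3     Taylor       5      4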
3.6.2 What is Rich Data? If you google ‘rich data’, you will find lots of different definitions for this term. In this course, we will use ‘rich data’ to mean data which is generated by combining information from multiple sources. For example, you could make rich data by creating an object in R which contains a matrix of gene expression values across the cells in your single-cell RNA-seq experiment, but also information about how the experiment was performed. Objects of the SingleCellExperiment class, which we will discuss below, are an example of rich data. Typically, Bioconductor packages make use of rich data objects, which have many advantages for package developers and users alike. 3.7 What is Bioconductor? From Wikipedia: Bioconductor is a free, open source and open development software project for the analysis and comprehension of genomic data generated by wet lab experiments in molecular biology. Bioconductor is based primarily on the statistical R programming language, but does contain contributions in other programming languages. It has two releases each year that follow the semiannual releases of R. At any one time there is a release version, which corresponds to the released version of R, and a development version, which corresponds to the development version of R. Most users will find the release version appropriate for their needs. We strongly recommend that all newcomers, and even experienced high-throughput data analysts, use the well-developed and maintained Bioconductor methods and classes. 3.8 SingleCellExperiment class SingleCellExperiment (SCE) is an S4 class for storing data from single-cell experiments. This includes specialized methods to store and retrieve spike-in information, dimensionality reduction coordinates and size factors for each cell, along with the usual metadata for genes and libraries. In practice, an object of this class can be created using its constructor: library(SingleCellExperiment) counts &lt;- matrix(rpois(100, lambda = 10), ncol=10, nrow=10) rownames(counts) &lt;- paste(&quot;gene&quot;, 1:10, sep = &quot;&quot;) colnames(counts) &lt;- paste(&quot;cell&quot;, 1:10, sep = &quot;&quot;) sce &lt;- SingleCellExperiment( assays = list(counts = counts), rowData = data.frame(gene_names = paste(&quot;gene_name&quot;, 1:10, sep = &quot;&quot;)), colData = data.frame(cell_names = paste(&quot;cell_name&quot;, 1:10, sep = &quot;&quot;)) ) sce ## class: SingleCellExperiment ## dim: 10 10 ## metadata(0): ## assays(1): counts ## rownames(10): gene1 gene2 ... gene9 gene10 ## rowData names(1): gene_names ## colnames(10): cell1 cell2 ... cell9 cell10 ## colData names(1): cell_names ## reducedDimNames(0): ## spikeNames(0): In the SingleCellExperiment, users can assign arbitrary names to entries of assays. To assist interoperability between packages, the authors provide some suggestions for what the names should be for particular types of data: counts: Raw count data, e.g. the number of reads or transcripts for a particular gene. normcounts: Normalized values on the same scale as the original counts. For example, counts divided by cell-specific size factors that are centred at unity. logcounts: Log-transformed counts or count-like values. In most cases, this will be defined as log-transformed normcounts, e.g. using log base 2 and a pseudo-count of 1. cpm: Counts-per-million. This is the read count for each gene in each cell, divided by the library size of each cell in millions. tpm: Transcripts-per-million. This is the number of transcripts for each gene in each cell, divided by the total number of transcripts in that cell (in millions). Each of these suggested names has an appropriate getter/setter method for convenient manipulation of the SingleCellExperiment.
For example, we can take the (very specifically named) counts slot, normalise it and assign it to normcounts instead: normcounts(sce) &lt;- log2(counts(sce) + 1) sce ## class: SingleCellExperiment ## dim: 10 10 ## metadata(0): ## assays(2): counts normcounts ## rownames(10): gene1 gene2 ... gene9 gene10 ## rowData names(1): gene_names ## colnames(10): cell1 cell2 ... cell9 cell10 ## colData names(1): cell_names ## reducedDimNames(0): ## spikeNames(0): dim(normcounts(sce)) ## [1] 10 10 head(normcounts(sce)) ## cell1 cell2 cell3 cell4 cell5 cell6 cell7 ## gene1 3.169925 3.169925 2.000000 2.584963 2.584963 3.321928 3.584963 ## gene2 3.459432 1.584963 3.584963 3.807355 3.700440 3.700440 3.000000 ## gene3 3.000000 3.169925 3.807355 3.169925 3.321928 3.321928 3.321928 ## gene4 3.584963 3.459432 3.000000 3.807355 3.700440 3.700440 3.700440 ## gene5 3.906891 3.000000 3.169925 3.321928 3.584963 3.459432 3.807355 ## gene6 3.700440 3.700440 3.584963 4.000000 3.169925 3.000000 3.459432 ## cell8 cell9 cell10 ## gene1 3.321928 3.807355 2.807355 ## gene2 3.807355 3.700440 4.000000 ## gene3 2.584963 4.000000 3.700440 ## gene4 3.169925 3.584963 3.700440 ## gene5 3.807355 2.584963 3.584963 ## gene6 3.321928 3.459432 4.000000 3.9 scater package scater is an R package for single-cell RNA-seq analysis (McCarthy et al. 2017). The package contains several useful methods for quality control, visualisation and pre-processing of data prior to further downstream analysis. scater features the following functionality: Automated computation of QC metrics Transcript quantification from read data with pseudo-alignment Data format standardisation Rich visualizations for exploratory analysis Seamless integration into the Bioconductor universe Simple normalisation methods We highly recommend using scater for all single-cell RNA-seq analyses; scater is the basis of the first part of the course. As illustrated in the figure below, scater will help you with quality control, filtering and normalization of your expression matrix following mapping and alignment. Keep in mind that this figure represents the original version of scater, in which an SCESet class was used. In the newest version this figure is still correct, except that SCESet can be substituted with the SingleCellExperiment class.
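As a small taste of the QC functionality listed above, here is a hedged sketch using the toy sce object from Section 3.8; note that the function name has changed across scater versions (recent releases provide addPerCellQC(), older ones used calculateQCMetrics()):
library(scater)
sce &lt;- addPerCellQC(sce)  # appends per-cell QC metrics (e.g. total counts, detected genes) to colData
head(colData(sce))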
3.10 Introduction to ggplot2 3.10.1 What is ggplot2? ggplot2 is an R package designed by Hadley Wickham which facilitates data plotting. In this lab, we will touch briefly on some of the features of the package. If you would like to learn more about how to use ggplot2, we would recommend reading “ggplot2: Elegant Graphics for Data Analysis” by Hadley Wickham. 3.10.2 Principles of ggplot2 Your data must be a dataframe if you want to plot it using ggplot2. Use the aes mapping function to specify how variables in the dataframe map to features on your plot. Use geoms to specify how your data should be represented on your graph, e.g. as a scatterplot, a barplot or a boxplot. 3.10.3 Using the aes mapping function The aes function specifies how variables in your dataframe map to features on your plot. To understand how this works, let’s look at an example: library(ggplot2) library(tidyverse) set.seed(1) counts &lt;- as.data.frame(matrix(rpois(100, lambda = 10), ncol=10, nrow=10)) Gene_ids &lt;- paste(&quot;gene&quot;, 1:10, sep = &quot;&quot;) colnames(counts) &lt;- paste(&quot;cell&quot;, 1:10, sep = &quot;&quot;) counts&lt;-data.frame(Gene_ids, counts) counts ## Gene_ids cell1 cell2 cell3 cell4 cell5 cell6 cell7 cell8 cell9 cell10 ## 1 gene1 8 8 3 5 5 9 11 9 13 6 ## 2 gene2 10 2 11 13 12 12 7 13 12 15 ## 3 gene3 7 8 13 8 9 9 9 5 15 12 ## 4 gene4 11 10 7 13 12 12 12 8 11 12 ## 5 gene5 14 7 8 9 11 10 13 13 5 11 ## 6 gene6 12 12 11 15 8 7 10 9 10 15 ## 7 gene7 11 11 14 11 11 5 9 13 13 7 ## 8 gene8 9 12 9 8 6 14 7 12 12 10 ## 9 gene9 14 12 11 7 10 10 8 14 7 10 ## 10 gene10 11 10 9 7 11 16 8 7 7 4 ggplot(data = counts, mapping = aes(x = cell1, y = cell2)) Let’s take a closer look at the final command, ggplot(data = counts, mapping = aes(x = cell1, y = cell2)). ggplot() initialises a ggplot object and takes the arguments data and mapping. We pass our dataframe of counts to data and use the aes() function to specify that we would like to use the variable cell1 as our x variable and the variable cell2 as our y variable. Task 1: Modify the command above to initialise a ggplot object where cell10 is the x variable and cell8 is the y variable. Clearly, the plots we have just created are not very informative because no data is displayed on them. To display data, we will need to use geoms. 3.10.4 Geoms We can use geoms to specify how we would like data to be displayed on our graphs. For example, our choice of geom could specify that we would like our data to be displayed as a scatterplot, a barplot or a boxplot. Let’s see how our graph would look as a scatterplot. ggplot(data = counts, mapping = aes(x = cell1, y = cell2)) + geom_point() Now we can see that there doesn’t seem to be any correlation between gene expression in cell1 and cell2. Given we generated counts randomly, this isn’t too surprising. Task 2: Modify the command above to create a line plot. Hint: execute ?ggplot and scroll down the help page. At the bottom is a link to the ggplot2 package index. Scroll through the index until you find the geom options. 3.10.5 Plotting data from more than 2 cells So far we’ve been considering the gene counts from 2 of the cells in our dataframe. But there are actually 10 cells in our dataframe and it would be nice to compare all of them. What if we wanted to plot data from all 10 cells at the same time? At the moment we can’t do this because we are treating each individual cell as a variable and assigning that variable to either the x or the y axis. We could create a 10-dimensional graph to plot data from all 10 cells on, but this is a) not possible to do with ggplot2 and b) not very easy to interpret. What we could do instead is to tidy our data so that we have one variable representing cell ID and another variable representing gene counts, and plot those against each other. In code, this would look like: counts&lt;-gather(counts, colnames(counts)[2:11], key = &#39;Cell_ID&#39;, value=&#39;Counts&#39;) head(counts) ## Gene_ids Cell_ID Counts ## 1 gene1 cell1 8 ## 2 gene2 cell1 10 ## 3 gene3 cell1 7 ## 4 gene4 cell1 11 ## 5 gene5 cell1 14 ## 6 gene6 cell1 12 Essentially, the problem before was that our data was not tidy because one variable (Cell_ID) was spread over multiple columns. Now that we’ve fixed this problem, it is much easier for us to plot data from all 10 cells on one graph.
ggplot(counts, aes(x=Cell_ID, y=Counts)) + geom_boxplot() Task 3: Use the updated counts dataframe to plot a barplot with Cell_ID as the x variable and Counts as the y variable. Hint: you may find it helpful to read ?geom_bar. Task 4: Use the updated counts dataframe to plot a scatterplot with Gene_ids as the x variable and Counts as the y variable. 3.10.6 Plotting heatmaps A common method for visualising gene expression data is with a heatmap. Here we will use the R package pheatmap to perform this analysis with some gene expression data we will name test. library(pheatmap) set.seed(2) test = matrix(rnorm(200), 20, 10) test[1:10, seq(1, 10, 2)] = test[1:10, seq(1, 10, 2)] + 3 test[11:20, seq(2, 10, 2)] = test[11:20, seq(2, 10, 2)] + 2 test[15:20, seq(2, 10, 2)] = test[15:20, seq(2, 10, 2)] + 4 colnames(test) = paste(&quot;Cell&quot;, 1:10, sep = &quot;&quot;) rownames(test) = paste(&quot;Gene&quot;, 1:20, sep = &quot;&quot;) pheatmap(test) Let’s take a moment to work out what this graphic is showing us. Each row represents a gene and each column represents a cell. How highly expressed each gene is in each cell is represented by the colour of the corresponding box. For example, we can tell from this plot that gene18 is highly expressed in cell10 but lowly expressed in cell1. This plot also gives us information on the results of a clustering algorithm. In general, clustering algorithms aim to split datapoints (e.g. cells) into groups whose members are more similar to one another than they are to the rest of the datapoints. The trees drawn on the top and left-hand sides of the graph are the results of clustering algorithms, and enable us to see, for example, that cells 4, 8, 2, 6 and 10 are more similar to one another than they are to cells 7, 3, 5, 1 and 9. The tree on the left-hand side of the graph represents the results of a clustering algorithm applied to the genes in our dataset. If we look closely at the trees, we can see that eventually they have the same number of branches as there are cells and genes. In other words, the total number of cell clusters is the same as the total number of cells, and the total number of gene clusters is the same as the total number of genes. Clearly, this is not very informative, and will become impractical when we are looking at more than 10 cells and 20 genes. Fortunately, we can set the number of clusters we see on the plot. Let’s try setting the number of gene clusters to 2: pheatmap(test, kmeans_k = 2) Now we can see that the genes fall into two clusters - a cluster of 8 genes which are upregulated in cells 2, 10, 6, 4 and 8 relative to the other cells, and a cluster of 12 genes which are downregulated in cells 2, 10, 6, 4 and 8 relative to the other cells. Task 5: Try setting the number of clusters to 3. Which number of clusters do you think is more informative? 3.10.7 Principal Component Analysis Principal component analysis (PCA) is a statistical procedure that uses a transformation to convert a set of observations into a set of values of linearly uncorrelated variables called principal components. The transformation is carried out so that the first principal component accounts for as much of the variability in the data as possible, and each following principal component accounts for the greatest amount of variance possible under the constraint that it must be orthogonal to the previous components. PCA plots are a good way to get an overview of your data, and can sometimes help identify confounders which explain a high amount of the variability in your data.
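To make “accounts for as much of the variability as possible” concrete, the summary of a prcomp fit reports the proportion of variance captured by each component; a minimal, self-contained sketch on throwaway random data:
set.seed(3)
m &lt;- matrix(rnorm(100), 10, 10)
summary(prcomp(m))$importance[, 1:3]  # standard deviation, proportion of variance and cumulative proportion for PC1-PC3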
We will investigate how we can use PCA plots in single-cell RNA-seq analysis in more depth in a future lab; here the aim is to give you an overview of what PCA plots are and how they are generated. Let’s make a PCA plot for our test data. We can use the ggfortify package to let ggplot2 know how to interpret principal components. library(ggfortify) Principal_Components&lt;-prcomp(test) autoplot(Principal_Components, label=TRUE) Task 6: Compare your clusters to the pheatmap clusters. Are they related? (Hint: have a look at the gene tree for the first pheatmap we plotted) Task 7: Produce a heatmap and PCA plot for counts (below): set.seed(1) counts &lt;- as.data.frame(matrix(rpois(100, lambda = 10), ncol=10, nrow=10)) rownames(counts) &lt;- paste(&quot;gene&quot;, 1:10, sep = &quot;&quot;) colnames(counts) &lt;- paste(&quot;cell&quot;, 1:10, sep = &quot;&quot;) References "],
 ["datasets.html", "4 Datasets 4.1 Deng 4.2 Tung 4.3 Pancreas 4.4 Heart 4.5 Thymus 4.6 Tabula Muris 4.7 Introduction 4.8 Downloading the data 4.9 Reading the data (Smartseq2) 4.10 Building a SingleCellExperiment object 4.11 Reading the data (10X) 4.12 Building a SingleCellExperiment object for the 10X data 4.13 Advanced Exercise", " 4 Datasets Here we provide brief descriptions of the core datasets used in this course and a more detailed description of the Tabula Muris (mouse cell atlas) data, how it can be downloaded and how it can be used. 4.1 Deng A single-cell RNA-seq dataset of 268 individual cells dissociated from in vivo F1 embryos from oocyte to blastocyst stages of mouse preimplantation development. Single-cell transcriptome profiles were generated with Smart-seq or Smart-seq2 from each individual cell with spike-ins (NB: both the Smart-seq and Smart-seq2 protocols were used, for different sets of cells in the dataset). Cells annlysed here have been annotated with their developmental stages according to the original publication. Deng, Qiaolin, et al. “Single-cell RNA-seq reveals dynamic, random monoallelic gene expression in mammalian cells.” Science 343.6167 (2014) 193-196. 4.2 Tung A dataset of induced pluripotent stem cells generated from three different individuals with replicates (Tung et al. 2017) in Yoav Gilad’s lab at the University of Chicago. Data generated using Fluidigm C1 platform and to facilitate the quantification both unique molecular identifiers (UMIs) and ERCC spike-ins were used. The data files are located in the tung folder in your working directory. These files are the copies of the original files made on the 15/03/16. We will use these copies for reproducibility purposes. Tung, Po-Yuan, et al. “Batch effects and the effective design of single-cell gene expression studies.” Scientific reports 7 (2017): 39921. 4.3 Pancreas We have included two human pancreas datasets: from Muraro et al (2016) (Muraro et al. 2016) and Segerstolpe et al. (2016) (Segerstolpe et al. 2016). Since the pancreas has been widely studied, these datasets are well annotated. 4.3.1 Muraro Single-cell CEL-seq2 data were generated using a customised automated platform that uses FACS, robotics, and the CEL-Seq2 protocol to obtain the transcriptomes of thousands of single pancreatic cells from four deceased organ donors. Cell surface markers can be used for sorting and enriching certain cell types.(Muraro et al. 2016) Muraro,M.J. et al. (2016) A Single-Cell Transcriptome Atlas of the Human Pancreas. Cell Syst, 3, 385–394.e3. 4.3.2 Segerstolpe Single-cell RNA-seq dataset of human pancreatic cells from patients with type 2 diabetes and healthy controls. Single cells were prepared using Smart-seq2 protocol and sequenced on an Illumina HiSeq 2000.(Segerstolpe et al. 2016) Segerstolpe,Å. et al. (2016) Single-Cell Transcriptome Profiling of Human Pancreatic Islets in Health and Type 2 Diabetes. Cell Metab., 24, 593–607. 4.4 Heart data/sce/Heart_10X.rds is a SCE object containing cells from Heart tissue from the Tabula Muris dataset (details below) using 10X protocol. 4.5 Thymus data/sce/Thymus_10X.rds is a SCE object containing cells from Thymus tissue from the Tabula Muris dataset (details below) using 10X protocol. 4.6 Tabula Muris 4.7 Introduction To give you hands-on experience analyzing from start to finish a single-cell RNASeq dataset we will be using as an example, data from the Tabula Muris initial release. 
4.6 Tabula Muris 4.7 Introduction To give you hands-on experience analyzing a single-cell RNA-seq dataset from start to finish, we will use data from the Tabula Muris initial release as an example. The Tabula Muris is an international collaboration with the aim of profiling every cell type in the mouse using a standardized method. They combine high-throughput but low-coverage 10X data with lower-throughput but higher-coverage Smart-seq2 data from FACS-sorted cells. The initial release of the data (20 Dec 2017) contains almost 100,000 cells across 20 different tissues/organs. You might like to choose a tissue to focus on for a detailed analysis. 4.8 Downloading the data Unlike most single-cell RNA-seq datasets, Tabula Muris released its data through the figshare platform rather than uploading it to GEO or ArrayExpress. You can find the data by using the DOIs in their paper: 10.6084/m9.figshare.5715040 for FACS/Smartseq2 and 10.6084/m9.figshare.5715025 for 10X data. The data can be downloaded manually by clicking the DOI links, or by using the command-line commands below: Terminal-based download of FACS data: wget https://ndownloader.figshare.com/files/10038307 unzip 10038307 wget https://ndownloader.figshare.com/files/10038310 mv 10038310 FACS_metadata.csv wget https://ndownloader.figshare.com/files/10039267 mv 10039267 FACS_annotations.csv Terminal-based download of 10X data: wget https://ndownloader.figshare.com/files/10038325 unzip 10038325 wget https://ndownloader.figshare.com/files/10038328 mv 10038328 droplet_metadata.csv wget https://ndownloader.figshare.com/files/10039264 mv 10039264 droplet_annotation.csv Note: if you download the data by hand you should unzip &amp; rename the files as above before continuing. You should now have two folders, “FACS” and “droplet”, and one annotation and metadata file for each. To inspect these files you can use head to see the top few lines of each text file (press “q” to exit): head -n 10 droplet_metadata.csv You can also check the number of rows in each file using: wc -l droplet_annotation.csv Exercise How many cells do we have annotations for from FACS? From 10X? Answer FACS: 54,838 cells Droplet: 42,193 cells
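The same check can be run from R instead of the shell; a hedged sketch, assuming the files were downloaded and renamed as above (nrow() excludes the header line, so it may differ by one from wc -l):
nrow(read.csv(&quot;FACS_annotations.csv&quot;))     # annotated FACS cells
nrow(read.csv(&quot;droplet_annotation.csv&quot;))   # annotated droplet cells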
4.9 Reading the data (Smartseq2) We can now read in the relevant count matrix from the comma-separated file and inspect the resulting dataframe: dat &lt;- read.delim(&quot;FACS/Kidney-counts.csv&quot;, sep=&quot;,&quot;, header=TRUE) dat[1:5,1:5] We can see that the first column in the dataframe contains the gene names, so first we move these to the rownames so that we have a numeric matrix: dim(dat) rownames(dat) &lt;- dat[,1] dat &lt;- dat[,-1] Since this is a Smart-seq2 dataset it may contain spike-ins, so let’s check: rownames(dat)[grep(&quot;^ERCC-&quot;, rownames(dat))] Now we can extract much of the metadata for this dataset from the column names: cellIDs &lt;- colnames(dat) cell_info &lt;- strsplit(cellIDs, &quot;\\\\.&quot;) Well &lt;- lapply(cell_info, function(x){x[1]}) Well &lt;- unlist(Well) Plate &lt;- unlist(lapply(cell_info, function(x){x[2]})) Mouse &lt;- unlist(lapply(cell_info, function(x){x[3]})) We can check the distributions of each of these metadata classifications: summary(factor(Mouse)) We can also check if any technical factors are confounded: table(Mouse, Plate) Lastly we will read the computationally inferred cell-type annotations and match them to the cells in our expression matrix: ann &lt;- read.table(&quot;FACS_annotations.csv&quot;, sep=&quot;,&quot;, header=TRUE) ann &lt;- ann[match(cellIDs, ann[,1]),] celltype &lt;- ann[,3] 4.10 Building a SingleCellExperiment object To create a SingleCellExperiment object we must put together all the cell annotations into a single dataframe; since the experimental batch (PCR plate) is completely confounded with the donor mouse, we will keep only one of them. library(&quot;SingleCellExperiment&quot;) library(&quot;scater&quot;) cell_anns &lt;- data.frame(mouse = Mouse, well=Well, type=celltype) rownames(cell_anns) &lt;- colnames(dat) sceset &lt;- SingleCellExperiment(assays = list(counts = as.matrix(dat)), colData=cell_anns) Finally, if the dataset contains spike-ins, we add a hidden variable to the SingleCellExperiment object to track them: isSpike(sceset, &quot;ERCC&quot;) &lt;- grepl(&quot;ERCC-&quot;, rownames(sceset)) 4.11 Reading the data (10X) Due to the large size and sparsity of 10X data (up to 90% of the expression matrix may be 0s), it is typically stored as a sparse matrix. The default output format for CellRanger is an .mtx file which stores this sparse matrix as a column of row coordinates, a column of column coordinates, and a column of expression values &gt; 0. Note: if you look at the .mtx file you will see two header lines followed by a line detailing the total number of rows, columns and counts for the full matrix. Since only the coordinates are stored in the .mtx file, the names of each row &amp; column must be stored separately in the “genes.tsv” and “barcodes.tsv” files respectively. We will be using the “Matrix” package to store matrices in sparse-matrix format in R. The SingleCellExperiment class naturally handles sparse matrices, and many downstream tools, including scater, scran and DropletUtils, also handle data stored in sparse matrices, reducing the memory requirements for many early steps in an analysis. The SingleCellExperiment class can also use data in HDF5 format, which allows large non-sparse matrices to be stored &amp; accessed on disk in an efficient manner rather than loading the whole thing into RAM. library(&quot;Matrix&quot;) cellbarcodes &lt;- read.table(&quot;droplet/Kidney-10X_P4_5/barcodes.tsv&quot;) genenames &lt;- read.table(&quot;droplet/Kidney-10X_P4_5/genes.tsv&quot;) molecules &lt;- readMM(&quot;droplet/Kidney-10X_P4_5/matrix.mtx&quot;)
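As an aside on why the sparse representation matters, you can compare the memory footprint of the sparse object with a dense copy; a minimal sketch assuming the molecules matrix just read in (for large matrices the dense copy may not fit in RAM, so treat this as illustrative):
object.size(molecules)             # sparse format stores only the non-zero entries
object.size(as.matrix(molecules))  # the dense equivalent is typically many times larger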
Now we will add the appropriate row and column names. However, if you inspect cellbarcodes you will see that the entries are just the barcode sequences associated with each cell. This is a problem, since each batch of 10X data uses the same pool of barcodes, so if we need to combine data from multiple 10X batches the cell barcodes will not be unique. Hence we will attach the batch ID to each cell barcode: head(cellbarcodes) rownames(molecules) &lt;- genenames[,1] colnames(molecules) &lt;- paste(&quot;10X_P4_5&quot;, cellbarcodes[,1], sep=&quot;_&quot;) Now let’s get the metadata and computational annotations for this data: meta &lt;- read.delim(&quot;droplet_metadata.csv&quot;, sep=&quot;,&quot;, header = TRUE) head(meta) Here we can see that we need to use 10X_P4_5 to find the metadata for this batch. Also note that the format of the mouse ID is different in this metadata table, with hyphens instead of underscores and with the gender in the middle of the ID. From checking the methods section of the accompanying paper, we know that the same 8 mice were used for both droplet and plate-based techniques, so we need to fix the mouse IDs to be consistent with those used in the FACS experiments. meta[meta$channel == &quot;10X_P4_5&quot;,] mouseID &lt;- &quot;3_8_M&quot; Note: depending on the tissue you choose, you may have 10X data from mixed samples, e.g. mouse id = 3-M-5/6. You should still reformat these to be consistent, but they will not match mouse ids from the FACS data, which may affect your downstream analysis. If the mice weren’t from an inbred strain it would be possible to assign individual cells to a specific mouse using exonic SNPs, but that is beyond the scope of this course. ann &lt;- read.delim(&quot;droplet_annotation.csv&quot;, sep=&quot;,&quot;, header=TRUE) head(ann) Again you will find a slight formatting difference between the cellID in the annotation and the cellbarcodes, which we will have to correct before matching them. ann[,1] &lt;- paste(ann[,1], &quot;-1&quot;, sep=&quot;&quot;) ann_subset &lt;- ann[match(colnames(molecules), ann[,1]),] celltype &lt;- ann_subset[,3] Now let’s build the cell-metadata dataframe: cell_anns &lt;- data.frame(mouse = rep(mouseID, times=ncol(molecules)), type=celltype) rownames(cell_anns) &lt;- colnames(molecules) Exercise Repeat the above for the other 10X batches for your tissue. Answer 4.12 Building a SingleCellExperiment object for the 10X data Now that we have read the 10X data in multiple batches, we need to combine them into a single SingleCellExperiment object. First we will check that the gene names are the same and in the same order across all batches: identical(rownames(molecules1), rownames(molecules2)) identical(rownames(molecules1), rownames(molecules3)) Now we’ll check that there aren’t any repeated cellIDs: sum(colnames(molecules1) %in% colnames(molecules2)) sum(colnames(molecules1) %in% colnames(molecules3)) sum(colnames(molecules2) %in% colnames(molecules3)) Everything is OK, so we can go ahead and combine them: all_molecules &lt;- cbind(molecules1, molecules2, molecules3) all_cell_anns &lt;- as.data.frame(rbind(cell_anns1, cell_anns2, cell_anns3)) all_cell_anns$batch &lt;- rep(c(&quot;10X_P4_5&quot;, &quot;10X_P4_6&quot;,&quot;10X_P7_5&quot;), times = c(nrow(cell_anns1), nrow(cell_anns2), nrow(cell_anns3))) Exercise How many cells are in the whole dataset? (A one-line sketch follows.) Answer
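A one-line sketch for this exercise, assuming the combined all_molecules matrix from above:
ncol(all_molecules)  # each column is one cell, so this is the total number of cells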
Now build the SingleCellExperiment object. One of the advantages of the SingleCellExperiment class is that it is capable of storing data in normal matrix or sparse matrix format, as well as in HDF5 format, which allows large non-sparse matrices to be stored &amp; accessed on disk in an efficient manner rather than loading the whole thing into RAM. all_molecules &lt;- as.matrix(all_molecules) sceset &lt;- SingleCellExperiment( assays = list(counts = all_molecules), colData = all_cell_anns ) Since this is 10X data it will not contain spike-ins, so we just save the data: saveRDS(sceset, &quot;kidney_droplet.rds&quot;) 4.13 Advanced Exercise Write an R function/script which will fully automate this procedure for each data type for any tissue. References "],
 ["processing-raw-scrna-seq-data.html", "5 Processing raw scRNA-seq data 5.1 Generating fastq files from BCLs 5.2 FastQC 5.3 Trimming Reads 5.4 Fastp 5.5 Read alignment and gene expression quantification 5.6 Full-length transcript datasets 5.7 Tag-based datasets 5.8 Practise 5.9 Identifying cell-containing droplets/microwells", " 5 Processing raw scRNA-seq data 5.1 Generating fastq files from BCLs BCLs (Illumina sequencer’s base call files) are binary files with raw sequencing data generated from sequencers. If your data processing starts BCLs you will need to make fastq files from the BCL files. More on BCL format. For others, you may have received the fastq files from your sequencing facilities or collaborators, you can refer to Section 5.2 for pre-processing on fastq files. 5.1.1 Demultiplexing In cases where multiple sample libraries are pooled together for sequencing on one lane of a flowcell to reduce seqeuncing cost, we demultiplex the samples by their sample index in the step of making fastq files from BCLs. Sample indices are ‘barcodes’ for multiplexed samples which have been constructed in the read structure during the library preparation. Figure 2.3: Example 10X Final Library Structure 5.1.2 cellranger mkfastq If you are working with 10X Genomiec data, it is best to use the cellranger mkfastq pipleline, which wraps Illumina’s bcl2fastq and provides a number of convenient features designed specifically for 10X data format. In order to demultiplex samples, you would also need the sample_sheet.csv file which tells the mkfastq pipeline which libraries are sequenced on which lanes of the flowcell and what sample index sets they have. For example when you have multiple libraries sequenced on one lane here: With cellranger mkfastq, you can provide a simpleSampleSheet.csv file that has: Lane Sample Index 1 test_sample SI-P03-C9 1 test_sample2 SI-P03-A3 ... SI-P03-C9 and SI-P03-A3 are the 10x sample index set names. Each of them corresponds to a mix of 4 unique oligonucleotides so that the i7 index read is balanced across all 4 bases during sequencing. There are a list of 96 sample index sets and you can use any ones of them to ‘tag’ your samples. An example command to run cellranger mkfastq /mnt/Software/cellranger/cellranger-3.0.2/cellranger mkfastq \\ --run ./input.runs.folder/ --samplesheet {input.samplesheet.csv} \\ --id run_id \\ --qc --project MAXL_2019_LIM_organoid_RNAseq \\ --output-dir data/fastq_path/ \\ --jobmode=local --localcores=20 --localmem=50 1&gt; {log} After mkfastq, you end up with each sample’s fastq files from each sequencing lanes: test_sample_S1_L001_I1_001.fastq.gz test_sample_S1_L001_R1_001.fastq.gz test_sample_S1_L001_R2_001.fastq.gz test_sample2_S2_L001_I1_001.fastq.gz test_sample2_S2_L001_R1_001.fastq.gz test_sample2_S2_L001_R2_001.fastq.gz 5.1.3 Illumina bcl2fastq You can also use Illumina’s bcl2fastq tool directly and it is more generally applicable. bcf2fastq converts BCLs to fastqs while optionally demultiplexing sequencing data. Find the documentation of the tool here; training videos may also help to come to grips with using this tool. Figure 2.4: Sample Demultiplexing You will need to supply a SampleSheet.csv file like this: Figure 2.5: SampleSheet.csv file This information should come from your sequencing facilities. 
Running bcl2fastq can then be done like this: /usr/local/bin/bcl2fastq --runfolder-dir &lt;RunFolder&gt; --output-dir &lt;BaseCalls&gt; The output fastq files are named as SampleName_SampleNumber_Lane_Read_001.fastq.gz, the same as the cellranger mkfastq output (e.g. Sample1_S1_L001_R1_001.fastq.gz). 5.2 FastQC Once you’ve obtained your single-cell RNA-seq data, the first thing you need to do with it is check the quality of the reads you have sequenced. For this task, today we will be using a tool called FastQC. FastQC is a quality control tool for sequencing data, which can be used for both bulk and single-cell RNA-seq data. FastQC takes sequencing data as input and returns a report on read quality. Copy and paste this link into your browser to visit the FastQC website: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ This website contains links to download and install FastQC and documentation on the reports produced. Scroll down the webpage to ‘Example Reports’ and click ‘Good Illumina Data’. This gives an example of what an ideal report should look like for high-quality Illumina read data. Now let’s make a FastQC report ourselves. Today we will be performing our analysis using a single cell from an mESC dataset produced by Kolodziejczyk et al. (2015). The cells were sequenced using the SMART-seq2 library preparation protocol and the reads are paired end. Note You will have to download the files (both ERR522959_1.fastq and ERR522959_2.fastq) and create the Share directory yourself to run the commands. You can find the files here: https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-2600/samples/ Now let’s look at the files: less Share/ERR522959_1.fastq less Share/ERR522959_2.fastq Task 1: Try to work out what command you should use to produce the FastQC report. Hint: Try executing fastqc -h This command will tell you what options are available to pass to FastQC. Feel free to ask for help if you get stuck! If you are successful, you should generate a .zip and a .html file for both the forward and reverse reads files. Once you have been successful, feel free to have a go at the next section. 5.2.1 Solution and Downloading the Report If you haven’t done so already, generate the FastQC report using the commands below: mkdir fastqc_results fastqc -o fastqc_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq Once the command has finished executing, you should have a total of four files - one zip file for each of the paired end reads, and one html file for each of the paired end reads. The report is in the html file. To view it, we will need to get it onto your computer using either filezilla or scp. Ask an instructor if you are having difficulties. Once the file is on your computer, click on it. Your FastQC report should open. Have a look through the file. Remember to look at both the forward and reverse read reports! How good quality are the reads? Is there anything we should be concerned about? How might we address those concerns? 5.2.2 10X fastq quality checks If you have generated the fastq files with cellranger mkfastq as discussed above, you can get a list of quality metrics from the output files. The first thing to look at is the qc_summary.json file, which contains lines like this: &quot;sample_qc&quot;: { &quot;Sample1&quot;: { &quot;5&quot;: { &quot;barcode_exact_match_ratio&quot;: 0.9336158258904611, &quot;barcode_q30_base_ratio&quot;: 0.9611993091728814, &quot;bc_on_whitelist&quot;: 0.9447542078230667, &quot;mean_barcode_qscore&quot;: 37.770630795934, &quot;number_reads&quot;: 2748155, &quot;read1_q30_base_ratio&quot;: 0.8947676653366835, &quot;read2_q30_base_ratio&quot;: 0.7771883245304577 }, &quot;all&quot;: { &quot;barcode_exact_match_ratio&quot;: 0.9336158258904611, &quot;barcode_q30_base_ratio&quot;: 0.9611993091728814, &quot;bc_on_whitelist&quot;: 0.9447542078230667, &quot;mean_barcode_qscore&quot;: 37.770630795934, &quot;number_reads&quot;: 2748155, &quot;read1_q30_base_ratio&quot;: 0.8947676653366835, &quot;read2_q30_base_ratio&quot;: 0.7771883245304577 } } }
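These metrics are easy to pull into R for programmatic checks; a minimal sketch assuming the jsonlite package and that the file’s top level contains the sample_qc block shown above:
library(jsonlite)
qc &lt;- fromJSON(&quot;qc_summary.json&quot;)
qc$sample_qc$Sample1$all$number_reads          # total reads for this sample
qc$sample_qc$Sample1$all$read1_q30_base_ratio  # fraction of R1 bases with quality &gt;= 30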
5.3 Trimming Reads Fortunately there is software available for read trimming. Today we will be using Trim Galore!. Trim Galore! is a wrapper around the read-trimming software Cutadapt and FastQC. Read-trimming software can be used to trim sequencing adapters and/or low-quality bases from the ends of reads. Given we noticed there was some adapter contamination in our FastQC report, it is a good idea to trim adapters from our data. Task 2: What type of adapters were used in our data? Hint: Look at the FastQC report ‘Adapter Content’ plot. Now let’s try to use Trim Galore! to remove those problematic adapters. It’s a good idea to check read quality again after trimming, so after you have trimmed your reads you should use FastQC to produce another report. Task 3: Work out the command you should use to trim the adapters from our data. Hint 1: You can use the following command to find out what options you can pass to Trim Galore: trim_galore -h Hint 2: Read through the output of the above command carefully. The adapter used in this experiment is quite common. Do you need to know the actual sequence of the adapter to remove it? Task 4: Produce a FastQC report for your trimmed reads files. Is the adapter contamination gone? Once you think you have successfully trimmed your reads and have confirmed this by checking the FastQC report, feel free to check your results using the next section. 5.3.1 Solution You can use the command(s) below to trim the Nextera sequencing adapters: mkdir fastqc_trimmed_results trim_galore --nextera -o fastqc_trimmed_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq Remember to generate new FastQC reports for your trimmed reads files! FastQC should now show that your reads pass the ‘Adapter Content’ plot. Feel free to ask one of the instructors if you have any questions. Congratulations! You have now generated read quality reports and performed adapter trimming. In the next lab, we will use STAR and Kallisto to align our trimmed and quality-checked reads to a reference transcriptome. 5.4 Fastp Fastp is an ‘all-in-one’ pre-processing tool for fastq files which integrates many aspects of quality profiling both before and after filtering (quality curves, base content, k-mer content, Q20/Q30 rates, GC ratio, duplication, adapter content, etc.).
Example usage: mkdir fastp_results fastp -i Share/ERR522959_1.fastq -I Share/ERR522959_2.fastq \\ -o fastp_results/ERR522959_1.fastp.fastq -O fastp_results/ERR522959_2.fastp.fastq \\ --length_required 20 --average_qual 20 --detect_adapter_for_pe --correction \\ -h fastp_results/ERR522959.html -j fastp_results/ERR522959.json 5.5 Read alignment and gene expression quantification Now that we have trimmed our reads and established that they are of good quality, we would like to map them to a reference genome. This process is known as alignment. Some form of alignment is generally required if we want to quantify gene expression or find genes which are differentially expressed between samples. Many tools have been developed for read alignment. STAR (Dobin et al. 2013) is one of the most popular tools for RNA-seq read alignment. A number of alignment and gene expression quantification tools have also been designed specifically for single-cell RNA-seq data. Depending on the protocol used and the dataset generated, you can follow one of the two workflows presented here, for full-length or tag-based datasets. Today we will therefore focus on two tools, STAR and Kallisto/BUStools, and we will discuss other available tools at the end of this chapter. 5.6 Full-length transcript datasets If your single-cell RNA-seq dataset is from a plate-based protocol like Smart-seq2, then it can be aligned and quantified just like a bulk RNA-seq dataset. Each cell has a proper pair of fastq files (if it is paired-end sequencing), and there are no CB/UMI tags in the reads. STAR is a good choice for alignment (other good choices could be Subread or HISAT2). 5.6.1 Using STAR to align reads STAR tries to find the longest possible sequence which matches one or more sequences in the reference genome. For example, in the figure below, we have a read (blue) which spans two exons and an alternative splicing junction (purple). STAR finds that the first part of the read is the same as the sequence of the first exon, whilst the second part of the read matches the sequence in the second exon. Because STAR is able to recognise splicing events in this way, it is described as a ‘splice aware’ aligner. Figure 2.3: Diagram of how STAR performs alignments, taken from Dobin et al. Usually STAR aligns reads to a reference genome, potentially allowing it to detect novel splicing events or chromosomal rearrangements. 5.6.2 Expression quantification Now you have your aligned reads in a .bam file for each of your single cells. The next step is to quantify the expression level of each gene per cell. We can use one of the tools developed for bulk RNA-seq data, e.g. HTSeq or featureCounts, which do ‘simple’ counting of reads overlapping genomic features. Here we demonstrate an example with featureCounts, which counts mapped reads over genomic features such as genes, exons, promoters, gene bodies, genomic bins and chromosomal locations. # include multimapping &lt;featureCounts_path&gt;/featureCounts -O -M -Q 30 -p -a hg_annotations.gtf -o outputfile ERR522959.bam # exclude multimapping &lt;featureCounts_path&gt;/featureCounts -Q 30 -p -a hg_annotations.gtf -o outputfile ERR522959.bam This gives you a read-count gene expression matrix ready for downstream analysis; a sketch for loading it into R follows below.
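Loading the featureCounts table into R is straightforward; a hedged sketch (featureCounts writes a comment line starting with #, then six annotation columns, then one count column per input BAM; &#39;outputfile&#39; as in the command above):
fc &lt;- read.delim(&quot;outputfile&quot;, comment.char = &quot;#&quot;)
counts &lt;- fc[, 7:ncol(fc), drop = FALSE]  # count columns follow the 6 annotation columns
rownames(counts) &lt;- fc$Geneid
head(counts)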
5.7 Tag-based datasets If your dataset is tag-based, for example a 10X dataset, then you typically have sequences in R1 that entirely encode read identities, such as the cell barcode (CB) and UMI tags. In most cases, all your cell reads (for 1k-10k cells) are in one set of fastq files. Instead of trying to demultiplex all cells into separate fastqs and then doing alignment and quantification per cell, we can use tools that take care of this for us. In the following steps, we use Kallisto with BUStools to generate the gene expression quantification matrix for tag-based datasets. 5.7.1 Cellranger count If you work with 10X data, the cellranger count pipeline may work well for you. It comes with the cellranger software suite and has convenient features for 10X datasets. It takes the fastqs of a sample and uses STAR to align all cells’ reads. It also includes read filtering, barcode counting, and UMI counting. The output of this pipeline includes the aligned .bam file and the quantified gene expression matrix in both filtered and raw format. In V3, it adopted the EmptyDrops method (Lun et al., 2018), an algorithm that tries to distinguish true cell barcodes from barcodes associated with droplets that did not contain a cell (i.e. empty droplets; more details in Section 5.9). The filtered gene expression matrix from cellranger count V3 includes only the true cell barcodes determined by the EmptyDrops method. 5.7.2 Kallisto/bustools and pseudo-alignment STAR is a read aligner, whereas Kallisto is a pseudo-aligner (Bray et al. 2016). The main difference between aligners and pseudo-aligners is that whereas aligners map reads to a reference, pseudo-aligners map k-mers to a reference. 5.7.3 What is a k-mer? A k-mer is a sequence of length k derived from a read. For example, imagine we have a read with the sequence ATCCCGGGTTAT and we want to make 7-mers from it. To do this, we would find the first 7-mer by taking the first seven bases of the read. We would find the second 7-mer by moving one base along, then taking the next seven bases. Below are the read and all six 7-mers that can be derived from it: ATCCCGGGTTAT ATCCCGG TCCCGGG CCCGGGT CCGGGTT CGGGTTA GGGTTAT
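The same enumeration can be written in R; a minimal sketch (kmers is a hypothetical helper for illustration, not part of any package):
kmers &lt;- function(read, k) substring(read, 1:(nchar(read) - k + 1), k:nchar(read))
kmers(&quot;ATCCCGGGTTAT&quot;, 7)
## [1] &quot;ATCCCGG&quot; &quot;TCCCGGG&quot; &quot;CCCGGGT&quot; &quot;CCGGGTT&quot; &quot;CGGGTTA&quot; &quot;GGGTTAT&quot;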
5.7.4 Why map k-mers rather than reads? There are two main reasons: Pseudo-aligners use k-mers and a computational trick to make pseudo-alignment much faster than traditional aligners. If you are interested in how this is achieved, see (Bray et al. 2016) for details. Under some circumstances, pseudo-aligners may be able to cope better with sequencing errors than traditional aligners. For example, imagine there was a sequencing error in the first base of the read above and the A was actually a T. This would affect the pseudo-aligner’s ability to map the first 7-mer, but none of the following 7-mers. 5.7.5 Kallisto’s pseudo mode Kallisto has a specially designed mode for pseudo-aligning reads from single-cell RNA-seq experiments. Unlike STAR, Kallisto pseudo-aligns to a reference transcriptome rather than a reference genome. This means Kallisto maps reads to splice isoforms rather than genes. Mapping reads to isoforms rather than genes is especially challenging for single-cell RNA-seq for the following reasons: Single-cell RNA-seq is lower coverage than bulk RNA-seq, meaning the total amount of information available from reads is reduced. Many single-cell RNA-seq protocols have 3’ coverage bias, meaning if two isoforms differ only at their 5’ end, it might not be possible to work out which isoform the read came from. Some single-cell RNA-seq protocols have short read lengths, which can also mean it is not possible to work out which isoform the read came from. Kallisto’s pseudo mode takes a slightly different approach to pseudo-alignment. Instead of aligning to isoforms, Kallisto aligns to equivalence classes. Essentially, this means that if a read maps to multiple isoforms, Kallisto records the read as mapping to an equivalence class containing all the isoforms it maps to. The figures below help explain this. Figure 2.4: Overview of kallisto. The input consists of a reference transcriptome and reads from an RNA-seq experiment. (a) An example of a read (in black) and three overlapping transcripts with exonic regions as shown. (b) An index is constructed by creating the transcriptome de Bruijn Graph (T-DBG) where nodes (v1, v2, v3, …) are k-mers, each transcript corresponds to a colored path as shown and the path cover of the transcriptome induces a k-compatibility class for each k-mer. (c) Conceptually, the k-mers of a read are hashed (black nodes) to find the k-compatibility class of a read. (d) Skipping (black dashed lines) uses the information stored in the T-DBG to skip k-mers that are redundant because they have the same k-compatibility class. (e) The k-compatibility class of the read is determined by taking the intersection of the k-compatibility classes of its constituent k-mers. Taken from Bray et al (2016). Figure 2.5: A diagram explaining Kallisto’s Equivalence Classes, taken from Ntranos et al. Note: instead of using gene or isoform expression estimates in downstream analyses such as clustering, equivalence class counts can be used; in this course, we focus on gene-level estimation. 5.7.6 Running kallisto pseudo-alignment and BUStools Today, we will talk about doing single-cell pseudo-alignment and gene-level quantification with Kallisto/BUStools. See https://pachterlab.github.io/kallisto/manual for details. As for STAR, you will need to produce an index for Kallisto before the pseudo-alignment step. Use the command below to produce the Kallisto index, and use the Kallisto manual (https://pachterlab.github.io/kallisto/manual) to work out what the options in this command do. mkdir indices/Kallisto kallisto index -i indices/Kallisto/GRCm38.idx Share/mouse/Ensembl.GRCm38.96/Mus_musculus.GRCm38.cdna.all.fa.gz In this step, an index is constructed by creating the transcriptome de Bruijn Graph (T-DBG). 5.7.6.1 BUS format BUS is a binary file format designed for UMI-tagged single-cell datasets, with pseudo-aligned reads labelled with CB and UMI tags. Figure 2.7: BUS format, taken from Melsted, Páll et al. We run kallisto bus on the fastqs of single cells to generate the BUS file, and then use BUStools on the generated bus files to get gene-level quantification. Check the list of technologies supported by Kallisto/BUStools by running kallisto bus -l Use the command below to perform pseudo-alignment and generate bus files for single-cell sequencing data. The -x argument specifies the technology. List of supported single-cell technologies short name description ---------- ----------- 10xv1 10x version 1 chemistry 10xv2 10x version 2 chemistry 10xv3 10x version 3 chemistry CELSeq CEL-Seq CELSeq2 CEL-Seq version 2 DropSeq DropSeq inDrops inDrops SCRBSeq SCRB-Seq SureCell SureCell for ddSEQ mkdir results/Kallisto kallisto bus -i indices/Kallisto/GRCm38.idx -o results/Kallisto/output_bus -x &#39;10xv2&#39; -t 4 \\ SI-GA-G1/W11_S1_L001_R1_001.fastq.gz SI-GA-G1/W11_S1_L001_R2_001.fastq.gz \\ SI-GA-G1/W11_S1_L002_R1_001.fastq.gz SI-GA-G1/W11_S1_L002_R2_001.fastq.gz See https://pachterlab.github.io/kallisto/manual for instructions on creating bus files.
5.7.7 Understanding the Output of Kallisto BUS Pseudo-Alignment The command above should produce 4 files - matrix.ec, transcripts.txt, run_info.json and output.bus. transcripts.txt contains a list of transcripts, in the same order as in the transcriptome fasta file. matrix.ec contains information about the equivalence classes used. The first number in each row is the equivalence class ID. The second number(s) correspond to the transcript ID(s) in that equivalence class. For example, “10 1,2,3” would mean that equivalence class 10 contains transcript IDs 1, 2 and 3. The ID numbers correspond to the order in which the transcripts appear in transcripts.txt. Zero indexing is used, meaning transcript IDs 1, 2 and 3 correspond to the second, third and fourth transcripts in transcripts.txt. output.bus contains the binary-formatted cell barcode and UMI tags and the sets of equivalence classes of transcripts obtained by pseudo-alignment. (The fourth column is the count of reads with a given barcode, UMI and equivalence class combination; it is ignored, as one UMI should stand for one molecule.) run_info.json contains information about how Kallisto was executed and can be ignored. 5.7.8 Running Bustools Inputs: transcripts_to_genes.tsv: a tab-delimited file of a specific format: no headers, the first column is the transcript ID, and the second column is the corresponding gene ID. Transcript IDs must be in the same order as in the kallisto index. barcode whitelist: a whitelist that contains all the barcodes known to be present in the kit; it is provided by 10x and comes with CellRanger. First, bustools runs barcode error correction on the bus file. Then, the corrected bus file is sorted by barcode, UMI, and equivalence class. After that the UMIs are counted and the counts are collapsed to the gene level. mkdir ./output/out_bustools/genecount ./tmp bustools correct -w ./data/whitelist_v2.txt -p ./output/out_bustools/output.bus | \\ bustools sort -T tmp/ -t 4 -p - | \\ bustools count -o ./output/out_bustools/genecount/genes -g ./output/tr2g_hgmm.tsv \\ -e ./output/out_bustools/matrix.ec -t ./output/out_bustools/transcripts.txt --genecounts - The output includes: genes.barcodes.txt genes.genes.txt genes.mtx
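These three outputs can be assembled into a barcodes × genes sparse matrix in R; a hedged sketch using the paths from the command above:
library(Matrix)
m &lt;- readMM(&quot;./output/out_bustools/genecount/genes.mtx&quot;)
rownames(m) &lt;- readLines(&quot;./output/out_bustools/genecount/genes.barcodes.txt&quot;)
colnames(m) &lt;- readLines(&quot;./output/out_bustools/genecount/genes.genes.txt&quot;)
dim(m)  # barcodes x genes; transpose with t(m) if you want genes x cells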
5.7.9 Other alignment and quantification tools available Alevin Alevin is a tool for 10X and Drop-seq data that comes with Salmon, which is also a ‘pseudo-aligner’ for transcriptome quantification. Salmon is conceptually similar to Kallisto but uses different models for parameter estimation and accounts for sequence bias (3’/5’-end and fragment GC content) correction. STARsolo STARsolo is integrated with STAR. It does mapping, demultiplexing and gene quantification for droplet-based single-cell RNA-seq (e.g. 10X Genomics). It follows similar logic to the cellranger count pipeline, doing error correction and UMI deduplication and then quantifying expression per gene for each cell by counting reads with different UMIs mapped per gene. STARsolo is potentially ten times faster than cellranger count. If you are interested, here is a paper by Páll et al. that compares the performance of workflows in single-cell RNA-seq preprocessing: https://www.biorxiv.org/content/10.1101/673285v2.full 5.7.10 Summary Full-length transcript datasets: STAR -&gt; featureCounts. Tag-based datasets: kallisto bus -&gt; BUStools. 5.8 Practise 5.8.1 Using STAR One issue with STAR is that it needs a lot of RAM, especially if your reference genome is large (e.g. mouse and human). To speed up our analysis today, we will use STAR to align reads to a reference genome. Two steps are required to perform STAR alignment. In the first step, the user provides STAR with reference genome sequences (FASTA) and annotations (GTF), which STAR uses to create a genome index. In the second step, STAR maps the user’s read data to the genome index. Let’s create the index now. You can obtain genomes for many model organisms from Ensembl (https://www.ensembl.org/info/data/ftp/index.html). Task 1: Execute the commands below to create the index: mkdir indices mkdir indices/STAR STAR --runThreadN 4 --runMode genomeGenerate --genomeDir indices/STAR --genomeFastaFiles Share/hg19.fa --sjdbGTFfile Share/hg_annotations.gtf Task 2: What does each of the options we used do? Hint: Use the STAR manual to help you (https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf). Task 3: How would the command we used in Task 1 be different if we were aligning to the genome rather than the transcriptome? Now that we have created the index, we can perform the mapping step. Task 4: Try to work out what command you should use to map our trimmed reads (from ERR522959) to the index you created. Use the STAR manual to help you. Once you think you know the answer, check whether it matches the solution in the next section and execute the alignment. Task 5: Try to understand the output of your alignment. Talk to one of the instructors if you need help! 5.8.2 Solution for STAR Alignment You can use the following commands to perform the mapping step: mkdir results mkdir results/STAR STAR --runThreadN 4 --genomeDir indices/STAR --readFilesIn Share/ERR522959_1.fastq Share/ERR522959_2.fastq \\ --outFileNamePrefix results/STAR/ERR522959 5.9 Identifying cell-containing droplets/microwells For droplet-based methods, only a fraction of droplets contain both a bead and an intact cell. However, biology experiments are messy and some RNA will leak out of dead/damaged cells. So droplets without an intact cell are likely to capture a small amount of the ambient RNA, which will end up in the sequencing library and contribute reads to the final sequencing output. The variation in droplet size, amplification efficiency and sequencing will lead both “background” and real cells to have a wide range of library sizes. Various approaches have been used to try to distinguish those cell barcodes which correspond to real cells. 5.9.1 ‘Knee’ point One of the most widely used methods takes the total molecules per barcode (it could also be applied to total reads) and tries to find a “break point” between bigger libraries, which are cells plus some background, and smaller libraries, assumed to be purely background. Let’s load some example simulated data which contain both large and small cells: umi_per_barcode &lt;- read.table(&quot;data/droplet_id_example_per_barcode.txt.gz&quot;) truth &lt;- read.delim(&quot;data/droplet_id_example_truth.gz&quot;, sep=&quot;,&quot;) Exercise How many unique barcodes were detected? How many true cells are present in the data? To simplify calculations for this section, exclude all barcodes with fewer than 10 total molecules. Answer One approach is to look for the inflection point where the total molecules per barcode suddenly drops: barcode_rank &lt;- rank(-umi_per_barcode[,2]) plot(barcode_rank, umi_per_barcode[,2], xlim=c(1,8000)) Here we can see a roughly exponential curve of library sizes, so to make things simpler let’s log-transform them. log_lib_size &lt;- log10(umi_per_barcode[,2]) plot(barcode_rank, log_lib_size, xlim=c(1,8000)) That’s better; the “knee” in the distribution is much more pronounced.
We could manually estimate where the “knee” is, but it is much more reproducible to identify this point algorithmically. # inflection point o &lt;- order(barcode_rank) log_lib_size &lt;- log_lib_size[o] barcode_rank &lt;- barcode_rank[o] rawdiff &lt;- diff(log_lib_size)/diff(barcode_rank) inflection &lt;- which(rawdiff == min(rawdiff[100:length(rawdiff)], na.rm=TRUE)) plot(barcode_rank, log_lib_size, xlim=c(1,8000)) abline(v=inflection, col=&quot;red&quot;, lwd=2) threshold &lt;- 10^log_lib_size[inflection] cells &lt;- umi_per_barcode[umi_per_barcode[,2] &gt; threshold,1] TPR &lt;- sum(cells %in% truth[,1])/length(cells) Recall &lt;- sum(cells %in% truth[,1])/length(truth[,1]) c(TPR, Recall) ## [1] 1.0000000 0.7831707 5.9.2 Mixture model Another approach is to fit a mixture model and find where the higher and lower distributions intersect. However, the data may not fit the assumed distributions very well: set.seed(-92497) # mixture model require(&quot;mixtools&quot;) ## Loading required package: mixtools ## mixtools package, version 1.1.0, Released 2017-03-10 ## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772. mix &lt;- normalmixEM(log_lib_size) ## number of iterations= 43 plot(mix, which=2, xlab2=&quot;log(mol per cell)&quot;) p1 &lt;- dnorm(log_lib_size, mean=mix$mu[1], sd=mix$sigma[1]) p2 &lt;- dnorm(log_lib_size, mean=mix$mu[2], sd=mix$sigma[2]) if (mix$mu[1] &lt; mix$mu[2]) { split &lt;- min(log_lib_size[p2 &gt; p1]) } else { split &lt;- min(log_lib_size[p1 &gt; p2]) } Exercise Identify cells using this split point and calculate the TPR and Recall. Answer 5.9.3 Expected Number of Cells A third method, used by CellRanger V2, assumes a ~10-fold range of library sizes for real cells and estimates this range using the expected number of cells. n_cells &lt;- length(truth[,1]) # CellRanger v2 totals &lt;- umi_per_barcode[,2] totals &lt;- sort(totals, decreasing = TRUE) # 99th percentile of top n_cells divided by 10 thresh &lt;- totals[round(0.01*n_cells)]/10 plot(totals, xlim=c(1,8000)) abline(h=thresh, col=&quot;red&quot;, lwd=2) Exercise Identify cells using this threshold and calculate the TPR and Recall. Answer 5.9.4 EmptyDroplets Finally, EmptyDrops (https://github.com/MarioniLab/DropletUtils) is what we recommend for calling cell barcodes in droplet-based single-cell datasets. It should be noted that cellranger count v3 applies the EmptyDrops algorithm in its filtering step for true cell barcodes. Instead of trying to find a ‘threshold’ on UMI counts for determining true cells, EmptyDrops uses the full genes x cells molecule count matrix for all droplets and estimates the profile of “background” RNA from those droplets with extremely low counts, then looks for cells with gene-expression profiles which differ from the background. This is combined with an inflection point method, since background RNA often looks very similar to the expression profile of the largest cells in a population. As such, EmptyDrops is the only method able to identify barcodes for very small cells in highly diverse samples. Below we have provided code for how this method is currently run: library(&quot;Matrix&quot;) raw.counts &lt;- readRDS(&quot;data/pancreas/muraro.rds&quot;) library(&quot;DropletUtils&quot;) example(write10xCounts, echo=FALSE) dir.name &lt;- tmpdir list.files(dir.name) sce &lt;- read10xCounts(dir.name) sce my.counts &lt;- DropletUtils:::simCounts() br.out &lt;- barcodeRanks(my.counts)
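# Explanatory comments (hedged): barcodeRanks() from DropletUtils returns, for each
# barcode, its rank and total UMI count, along with fitted values, and stores knee and
# inflection estimates in metadata(br.out); these are what the plot below draws.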
plot(br.out$rank, br.out$total, log=&quot;xy&quot;, xlab=&quot;Rank&quot;, ylab=&quot;Total&quot;) o &lt;- order(br.out$rank) lines(br.out$rank[o], br.out$fitted[o], col=&quot;red&quot;) abline(h=metadata(br.out)$knee, col=&quot;dodgerblue&quot;, lty=2) abline(h=metadata(br.out)$inflection, col=&quot;forestgreen&quot;, lty=2) legend(&quot;bottomleft&quot;, lty=2, col=c(&quot;dodgerblue&quot;, &quot;forestgreen&quot;), legend=c(&quot;knee&quot;, &quot;inflection&quot;)) # emptyDrops set.seed(100) e.out &lt;- emptyDrops(my.counts) is.cell &lt;- e.out$FDR &lt;= 0.01 sum(is.cell, na.rm=TRUE) plot(e.out$Total, -e.out$LogProb, col=ifelse(is.cell, &quot;red&quot;, &quot;black&quot;), xlab=&quot;Total UMI count&quot;, ylab=&quot;-Log Probability&quot;) # cells &lt;- colnames(raw.counts)[is.cell] # TPR &lt;- sum(cells %in% truth[,1])/length(cells) # Recall &lt;- sum(cells %in% truth[,1])/length(truth[,1]) # c(TPR, Recall) References "],
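A footnote on the two TPR/Recall exercises above (mixture model and expected number of cells): a minimal sketch of possible answers, following the same pattern as the inflection-point code. It assumes split and thresh as computed above:
# mixture model: call cells whose log-library size exceeds the split point
cells &lt;- umi_per_barcode[log10(umi_per_barcode[,2]) &gt; split, 1]
TPR &lt;- sum(cells %in% truth[,1])/length(cells)
Recall &lt;- sum(cells %in% truth[,1])/length(truth[,1])
c(TPR, Recall)
# expected-number-of-cells threshold
cells &lt;- umi_per_barcode[umi_per_barcode[,2] &gt; thresh, 1]
TPR &lt;- sum(cells %in% truth[,1])/length(cells)
Recall &lt;- sum(cells %in% truth[,1])/length(truth[,1])
c(TPR, Recall)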
-["quality-control-and-data-visualisation.html", "6 Quality control and data visualisation 6.1 Expression QC overview (UMI) 6.2 Cell QC 6.3 Doublet detection 6.4 Gene QC 6.5 Exercise: Expression QC (Reads) 6.6 Data visualization and exploratory data analysis 6.7 Exercise: Data visualization (Reads)", " 6 Quality control and data visualisation The principle of garbage in, garbage out is at least as strong in single-cell genomics as it is elsewere in science. Effective quality control (QC) is crucial to high-quality scRNA-seq data analysis. We discuss principles and strategies for QC in this chapter, along with some discussion and demonstration of data visualisation approaches. 6.1 Expression QC overview (UMI) 6.1.1 Introduction Once gene expression has been quantified it is summarized as an expression matrix where each row corresponds to a gene (or transcript) and each column corresponds to a single cell. This matrix should be examined to remove poor quality cells which were not detected in either read QC or mapping QC steps. Failure to remove low quality cells at this stage may add technical noise which has the potential to obscure the biological signals of interest in the downstream analysis. Since there is currently no standard method for performing scRNASeq the expected values for the various QC measures that will be presented here can vary substantially from experiment to experiment. Thus, to perform QC we will be looking for cells which are outliers with respect to the rest of the dataset rather than comparing to independent quality standards. Consequently, care should be taken when comparing quality metrics across datasets collected using different protocols. 6.1.2 Tung dataset To illustrate cell QC, we consider a dataset of induced pluripotent stem cells generated from three different individuals (Tung et al. 2017) in Yoav Gilad’s lab at the University of Chicago. The experiments were carried out on the Fluidigm C1 platform and to facilitate the quantification both unique molecular identifiers (UMIs) and ERCC spike-ins were used. The data files are located in the tung folder in your working directory. These files are the copies of the original files made on the 15/03/16. We will use these copies for reproducibility purposes. library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) Load the data and annotations: molecules &lt;- read.table(&quot;data/tung/molecules.txt&quot;, sep = &quot;\\t&quot;) anno &lt;- read.table(&quot;data/tung/annotation.txt&quot;, sep = &quot;\\t&quot;, header = TRUE) Inspect a small portion of the expression matrix head(molecules[ , 1:3]) ## NA19098.r1.A01 NA19098.r1.A02 NA19098.r1.A03 ## ENSG00000237683 0 0 0 ## ENSG00000187634 0 0 0 ## ENSG00000188976 3 6 1 ## ENSG00000187961 0 0 0 ## ENSG00000187583 0 0 0 ## ENSG00000187642 0 0 0 head(anno) ## individual replicate well batch sample_id ## 1 NA19098 r1 A01 NA19098.r1 NA19098.r1.A01 ## 2 NA19098 r1 A02 NA19098.r1 NA19098.r1.A02 ## 3 NA19098 r1 A03 NA19098.r1 NA19098.r1.A03 ## 4 NA19098 r1 A04 NA19098.r1 NA19098.r1.A04 ## 5 NA19098 r1 A05 NA19098.r1 NA19098.r1.A05 ## 6 NA19098 r1 A06 NA19098.r1 NA19098.r1.A06 The data consists of 3 individuals and r length(unique(anno$replicate)) replicates and therefore has r length(unique(anno$batch)) batches in total. We standardize the analysis by using both SingleCellExperiment (SCE) and scater packages. 
First, create the SCE object: umi &lt;- SingleCellExperiment( assays = list(counts = as.matrix(molecules)), colData = anno ) Remove genes that are not expressed in any cell: keep_feature &lt;- rowSums(counts(umi) &gt; 0) &gt; 0 umi &lt;- umi[keep_feature, ] Define control features (genes) - ERCC spike-ins and mitochondrial genes (provided by the authors): isSpike(umi, &quot;ERCC&quot;) &lt;- grepl(&quot;^ERCC-&quot;, rownames(umi)) isSpike(umi, &quot;MT&quot;) &lt;- rownames(umi) %in% c(&quot;ENSG00000198899&quot;, &quot;ENSG00000198727&quot;, &quot;ENSG00000198888&quot;, &quot;ENSG00000198886&quot;, &quot;ENSG00000212907&quot;, &quot;ENSG00000198786&quot;, &quot;ENSG00000198695&quot;, &quot;ENSG00000198712&quot;, &quot;ENSG00000198804&quot;, &quot;ENSG00000198763&quot;, &quot;ENSG00000228253&quot;, &quot;ENSG00000198938&quot;, &quot;ENSG00000198840&quot;) Calculate the quality metrics: umi &lt;- calculateQCMetrics( umi, feature_controls = list( ERCC = isSpike(umi, &quot;ERCC&quot;), MT = isSpike(umi, &quot;MT&quot;) ) ) ## Warning in calculateQCMetrics(umi, feature_controls = list(ERCC = ## isSpike(umi, : spike-in set &#39;ERCC&#39; overwritten by feature_controls set of ## the same name 6.2 Cell QC 6.2.1 Library size Next we consider the total number of RNA molecules detected per sample (if we were using read counts rather than UMI counts this would be the total number of reads). Wells with few reads/molecules are likely to have been broken or failed to capture a cell, and should thus be removed. hist( umi$total_counts, breaks = 100 ) abline(v = 25000, col = &quot;red&quot;) Figure 6.1: Histogram of library sizes for all cells Exercise 1 How many cells does our filter remove? What distribution do you expect that the total number of molecules for each cell should follow? Our answer ## filter_by_total_counts ## FALSE TRUE ## 46 818 6.2.2 Detected genes In addition to ensuring sufficient sequencing depth for each sample, we also want to make sure that the reads are distributed across the transcriptome. Thus, we count the total number of unique genes detected in each sample. hist( umi$total_features_by_counts, breaks = 100 ) abline(v = 7000, col = &quot;red&quot;) Figure 6.2: Histogram of the number of detected genes in all cells From the plot we conclude that most cells have between 7,000 and 10,000 detected genes, which is normal for high-depth scRNA-seq. However, this varies by experimental protocol and sequencing depth. For example, droplet-based methods or samples with lower sequencing depth typically detect fewer genes per cell. The most notable feature in the above plot is the “heavy tail” on the left-hand side of the distribution. If detection rates were equal across the cells then the distribution should be approximately normal. Thus we remove those cells in the tail of the distribution (fewer than 7,000 detected genes). Exercise 2 How many cells does our filter remove? Our answer ## filter_by_expr_features ## FALSE TRUE ## 116 748 6.2.3 ERCCs and MTs Another measure of cell quality is the ratio between ERCC spike-in RNAs and endogenous RNAs. This ratio can be used to estimate the total amount of RNA in the captured cells. Cells with a high level of spike-in RNAs had low starting amounts of RNA, likely due to the cell being dead or stressed, which may result in the RNA being degraded.
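For reference, the filters in Exercises 1 and 2 above can be defined as follows — a minimal sketch using the thresholds marked on the two histograms (25,000 molecules and 7,000 genes); the names match those used in the manual filter later in this chapter:
# keep cells with sufficient total molecules (threshold from Figure 6.1)
filter_by_total_counts &lt;- (umi$total_counts &gt; 25000)
table(filter_by_total_counts)
# keep cells with sufficient detected genes (threshold from Figure 6.2)
filter_by_expr_features &lt;- (umi$total_features_by_counts &gt; 7000)
table(filter_by_expr_features)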
plotColData( umi, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_MT&quot;, colour = &quot;batch&quot; ) Figure 6.3: Percentage of counts in MT genes plotColData( umi, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_ERCC&quot;, colour = &quot;batch&quot; ) Figure 6.4: Percentage of counts in ERCCs The above analysis shows that the majority of the cells from the NA19098.r2 batch have a very high ERCC/Endo ratio. Indeed, it has been shown by the authors that this batch contains cells of smaller size. Exercise 3 Create filters for removing batch NA19098.r2 and cells with high expression of mitochondrial genes (&gt;10% of total counts in a cell). Our answer (one possible definition of these filters is sketched at the end of section 6.4.) ## filter_by_ERCC ## FALSE TRUE ## 96 768 ## filter_by_MT ## FALSE TRUE ## 31 833 Exercise 4 What would you expect to see in the ERCC vs counts plot if you were examining a dataset containing cells of different sizes (e.g. normal &amp; senescent cells)? Answer You would expect to see a group corresponding to the smaller cells (normal) with a higher fraction of ERCC reads than a separate group corresponding to the larger cells (senescent). 6.2.4 Cell filtering 6.2.4.1 Manual Now we can define a cell filter based on our previous analysis: umi$use &lt;- ( # sufficient features (genes) filter_by_expr_features &amp; # sufficient molecules counted filter_by_total_counts &amp; # sufficient endogenous RNA filter_by_ERCC &amp; # remove cells with unusual number of reads in MT genes filter_by_MT ) table(umi$use) ## ## FALSE TRUE ## 207 657 6.2.4.2 Automatic Another option available in scater is to conduct PCA on a set of QC metrics and then use automatic outlier detection to identify potentially problematic cells. By default, the following metrics are used for PCA-based outlier detection: pct_counts_top_100_features total_features pct_counts_feature_controls n_detected_feature_controls log10_counts_endogenous_features log10_counts_feature_controls scater first creates a matrix where the rows represent cells and the columns represent the different QC metrics. Outlier cells are then identified by running the mvoutlier package on this matrix of QC metrics for all cells. This will identify cells that have substantially different QC metrics from the others, possibly corresponding to low-quality cells. We can visualize any outliers using a principal components plot as shown below: umi &lt;- runPCA( umi, use_coldata = TRUE, detect_outliers = TRUE ) reducedDimNames(umi) ## [1] &quot;PCA_coldata&quot; Column subsetting can then be performed based on the $outlier slot, which indicates whether or not each cell has been designated as an outlier. Automatic outlier detection can be informative, but a close inspection of QC metrics and tailored filtering for the specifics of the dataset at hand is strongly recommended. table(umi$outlier) ## ## FALSE TRUE ## 791 73 Then, we can use a PCA plot to see a 2D representation of the cells ordered by their quality metrics. plotReducedDim( umi, use_dimred = &quot;PCA_coldata&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;use&quot;, colour_by = &quot;outlier&quot; ) 6.2.5 Compare filterings Exercise 5 Compare the default, automatic and manual cell filters. Plot a Venn diagram of the outlier cells from these filterings. Hint: Use vennCounts and vennDiagram functions from the limma package to make a Venn diagram.
Answer library(limma) auto &lt;- colnames(umi)[umi$outlier] man &lt;- colnames(umi)[!umi$use] venn.diag &lt;- vennCounts( cbind(colnames(umi) %in% auto, colnames(umi) %in% man) ) vennDiagram( venn.diag, names = c(&quot;Automatic&quot;, &quot;Manual&quot;), circle.col = c(&quot;blue&quot;, &quot;green&quot;) ) Figure 6.5: Comparison of the default, automatic and manual cell filters 6.3 Doublet detection For droplet-based datasets, there is a chance that multiple cells are enclosed in one droplet, resulting in one cell barcode actually containing read information from multiple cells. One way to find doublets/multiplets in the data is to see if there are cells co-expressing markers of distinct cell types. There are also computational tools available for detecting potential doublets in the cells. A lot of these tools rely on artificial doublets formed from the datasets by randomly joining the expression profiles of two cells. Then the cells are tested against the artificial doublet profiles. We demonstrate the usage of two of these doublet detection tools. 6.3.1 scds scds (Bais and Kostka 2019) has two detection methods: co-expression based and binary-classification based. In the co-expression based approach, gene-pair co-expression probabilities are estimated based on a binomial model, and gene pairs that are not normally co-expressed get higher scores when they are co-expressed in some cells. The cells’ doublet scores are derived based on the co-expression of pairs of genes. In the binary classification based approach, artificial doublet clusters are generated, and cells that are difficult to separate from the artificial doublets get higher doublet scores. library(scds) #- Annotate doublet using co-expression based doublet scoring: umi = cxds(umi) #- Annotate doublet using binary classification based doublet scoring: umi = bcds(umi) ## [1] train-error:0.058012+0.009028 test-error:0.082759+0.011116 ## Multiple eval metrics are present. Will use test_error for early stopping. ## Will train until test_error hasn&#39;t improved in 2 rounds. ## ## [2] train-error:0.043255+0.006356 test-error:0.076937+0.011995 ## [3] train-error:0.033853+0.003265 test-error:0.064193+0.016507 ## [4] train-error:0.029800+0.006149 test-error:0.059020+0.009223 ## [5] train-error:0.024594+0.002186 test-error:0.053783+0.016495 ## [6] train-error:0.021410+0.002716 test-error:0.051489+0.011570 ## [7] train-error:0.017650+0.001547 test-error:0.052659+0.008845 ## [8] train-error:0.016202+0.002439 test-error:0.043973+0.006895 ## [9] train-error:0.014032+0.002261 test-error:0.046287+0.012793 ## [10] train-error:0.011573+0.001440 test-error:0.042244+0.010619 ## [11] train-error:0.009404+0.001444 test-error:0.039930+0.010749 ## [12] train-error:0.009837+0.001076 test-error:0.039930+0.010904 ## [13] train-error:0.008535+0.001904 test-error:0.038771+0.010294 ## [14] train-error:0.007523+0.001079 test-error:0.038775+0.010791 ## [15] train-error:0.006655+0.001675 test-error:0.036457+0.008707 ## [16] train-error:0.005497+0.000577 test-error:0.037616+0.006611 ## [17] train-error:0.004919+0.000542 test-error:0.037032+0.006923 ## Stopping. Best iteration: ## [15] train-error:0.006655+0.001675 test-error:0.036457+0.008707 ## ## [1] train-error:0.054977 ## Will train until train_error hasn&#39;t improved in 2 rounds.
## ## [2] train-error:0.039931 ## [3] train-error:0.028356 ## [4] train-error:0.029514 ## [5] train-error:0.023148 ## [6] train-error:0.018519 ## [7] train-error:0.018519 ## [8] train-error:0.015625 #- Combine both annotations into a hybrid annotation umi = cxds_bcds_hybrid(umi) #- Doublet scores are now available via colData: CD = colData(umi) head(cbind(CD$cxds_score,CD$bcds_score, CD$hybrid_score)) ## [,1] [,2] [,3] ## NA19098.r1.A01 4131.405 0.020812672 0.2531013 ## NA19098.r1.A02 4564.089 0.010029603 0.2674993 ## NA19098.r1.A03 2827.904 0.005551378 0.1611125 ## NA19098.r1.A04 4708.213 0.005341432 0.2711787 ## NA19098.r1.A05 6134.590 0.005762757 0.3552646 ## NA19098.r1.A06 5810.730 0.010443962 0.3410366 plotColData( umi, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_ERCC&quot;, colour = &quot;hybrid_score&quot; ) The scds paper features excellent descriptions and evaluations of other currently-available doublet detection methods. 6.3.2 DoubletDetection DoubletDetection is a Python module that runs on raw UMI count data. It generates artificial doublets and then performs cell clustering using the augmented dataset. Cells that cluster closely with the artificial doublets across multiple iterations are predicted to be doublets. We provide the Python script for running DoubletDetection on the Tung dataset at ./mig_2019_scrnaseq-workshop/course_files/utils/run_doubletDetection.py python run_doubletDetection.py Here are the prediction results from DoubletDetection: require(UpSetR) ## Loading required package: UpSetR pred_tung &lt;- read.delim(file = &quot;data/doublets/tung.dbls.txt&quot;, header = FALSE) dim(pred_tung) ## [1] 864 1 dim(anno) ## [1] 864 5 umi$dbd_dbl &lt;- factor(pred_tung$V1) qc_label &lt;- read.delim(file = &quot;data/qc_ipsc.txt&quot;) head(qc_label) ## individual replicate well cell_number concentration tra1.60 ## 1 NA19098 r1 A01 1 1.734785 1 ## 2 NA19098 r1 A02 1 1.723038 1 ## 3 NA19098 r1 A03 1 1.512786 1 ## 4 NA19098 r1 A04 1 1.347492 1 ## 5 NA19098 r1 A05 1 2.313047 1 ## 6 NA19098 r1 A06 1 2.056803 1 qc_label$sample_id &lt;- paste0(qc_label$individual,&quot;.&quot;,qc_label$replicate,&quot;.&quot;,qc_label$well) rownames(qc_label) &lt;- qc_label$sample_id umi$cell_number &lt;- as.character(qc_label[umi$sample_id,&quot;cell_number&quot;]) umi$cell_number[qc_label$cell_number==0] &lt;- &quot;no_cell&quot; umi$cell_number[qc_label$cell_number == 1] &lt;- &quot;single_cell&quot; umi$cell_number[qc_label$cell_number&gt;1] &lt;- &quot;multi_cell&quot; multiplot(plotColData( umi, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_ERCC&quot;, colour = &quot;hybrid_score&quot; ), plotColData( umi, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_ERCC&quot;, colour = &quot;dbd_dbl&quot; ), plotColData( umi, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_ERCC&quot;, colour = &quot;cell_number&quot; ),cols =2) doublets &lt;- unique(umi$sample_id[umi$dbd_dbl ==&quot;1&quot;], umi$sample_id[umi$hybrid_score &gt; 0.8]) pl_list &lt;- UpSetR::fromList(list(pred = doublets,qc_label = qc_label$sample_id[qc_label$cell_number &gt;1])) UpSetR::upset(pl_list,sets = c(&quot;pred&quot;,&quot;qc_label&quot;)) 6.3.2.1 Other tools available: DoubletFinder DoubletCells as part of SimpleSingleCell Scrublet 6.4 Gene QC 6.4.1 Gene expression In addition to removing cells with poor quality, it is usually a good idea to exclude genes where we suspect that technical artefacts may have skewed the
results. Moreover, inspection of the gene expression profiles may provide insights about how the experimental procedures could be improved. It is often instructive to consider the number of reads consumed by the top 50 expressed genes. plotHighestExprs(umi, exprs_values = &quot;counts&quot;) Figure 6.6: Number of total counts consumed by the top 50 expressed genes The distributions are relatively flat indicating (but not guaranteeing!) good coverage of the full transcriptome of these cells. However, there are several spike-ins in the top 15 genes, which suggests a greater dilution of the spike-ins may be preferable if the experiment is to be repeated. 6.4.2 Gene filtering It is typically a good idea to remove genes whose expression level is considered “undetectable”. We define a gene as detectable if at least two cells contain more than 1 transcript from the gene. If we were considering read counts rather than UMI counts, a reasonable threshold is to require at least five reads in at least two cells. However, in both cases the threshold strongly depends on the sequencing depth. It is important to keep in mind that genes must be filtered after cell filtering, since some genes may only be detected in poor-quality cells (note the colData(umi)$use filter applied to the umi dataset). keep_feature &lt;- nexprs( umi[,colData(umi)$use], byrow = TRUE, detection_limit = 1 ) &gt;= 2 rowData(umi)$use &lt;- keep_feature table(keep_feature) ## keep_feature ## FALSE TRUE ## 4660 14066 Depending on the cell type, protocol and sequencing depth, other cut-offs may be appropriate. 6.4.3 Save the data Dimensions of the QCed dataset (do not forget about the gene filter we defined above): dim(umi[rowData(umi)$use, colData(umi)$use]) ## [1] 14066 657 Let’s create an additional slot with log-transformed counts (we will need it in the next chapters) and remove saved PCA results from the reducedDim slot: assay(umi, &quot;logcounts_raw&quot;) &lt;- log2(counts(umi) + 1) reducedDim(umi) &lt;- NULL Save the data: saveRDS(umi, file = &quot;data/tung/umi.rds&quot;) 6.4.4 Big Exercise Perform exactly the same QC analysis with read counts of the same Blischak data. Use tung/reads.txt file to load the reads. Once you have finished please compare your results to ours (next chapter).
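As promised in section 6.2.3, here is a minimal sketch of one possible definition of the Exercise 3 filters, consistent with the answer tables shown there (thresholds taken from the exercise text; the reads-based chapter uses looser cut-offs):
# remove the small-cell batch flagged by its high ERCC fraction
filter_by_ERCC &lt;- umi$batch != &quot;NA19098.r2&quot;
table(filter_by_ERCC)
# remove cells with &gt;10% of counts in mitochondrial genes
filter_by_MT &lt;- umi$pct_counts_MT &lt; 10
table(filter_by_MT)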
6.4.5 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] UpSetR_1.4.0 scds_1.0.0 ## [3] limma_3.40.6 scater_1.12.2 ## [5] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [7] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [9] BiocParallel_1.18.1 matrixStats_0.55.0 ## [11] Biobase_2.44.0 GenomicRanges_1.36.1 ## [13] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [15] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] ggbeeswarm_0.6.0 colorspace_1.4-1 ## [3] mvoutlier_2.0.9 class_7.3-15 ## [5] modeltools_0.2-22 rio_0.5.16 ## [7] mclust_5.4.5 XVector_0.24.0 ## [9] pls_2.7-1 BiocNeighbors_1.2.0 ## [11] cvTools_0.3.2 flexmix_2.3-15 ## [13] mvtnorm_1.0-11 ranger_0.11.2 ## [15] splines_3.6.0 sROC_0.1-2 ## [17] robustbase_0.93-5 knitr_1.25 ## [19] zeallot_0.1.0 robCompositions_2.1.0 ## [21] kernlab_0.9-27 cluster_2.1.0 ## [23] rrcov_1.4-7 compiler_3.6.0 ## [25] backports_1.1.4 assertthat_0.2.1 ## [27] Matrix_1.2-17 lazyeval_0.2.2 ## [29] BiocSingular_1.0.0 htmltools_0.3.6 ## [31] tools_3.6.0 rsvd_1.0.2 ## [33] gtable_0.3.0 glue_1.3.1 ## [35] GenomeInfoDbData_1.2.1 dplyr_0.8.3 ## [37] Rcpp_1.0.2 carData_3.0-2 ## [39] cellranger_1.1.0 zCompositions_1.3.2-1 ## [41] vctrs_0.2.0 sgeostat_1.0-27 ## [43] fpc_2.2-3 DelayedMatrixStats_1.6.1 ## [45] lmtest_0.9-37 xfun_0.9 ## [47] laeken_0.5.0 stringr_1.4.0 ## [49] openxlsx_4.1.0.1 lifecycle_0.1.0 ## [51] irlba_2.3.3 DEoptimR_1.0-8 ## [53] zlibbioc_1.30.0 MASS_7.3-51.1 ## [55] zoo_1.8-6 scales_1.0.0 ## [57] VIM_4.8.0 hms_0.5.1 ## [59] RColorBrewer_1.1-2 yaml_2.2.0 ## [61] curl_4.2 NADA_1.6-1 ## [63] gridExtra_2.3 reshape_0.8.8 ## [65] stringi_1.4.3 highr_0.8 ## [67] pcaPP_1.9-73 e1071_1.7-2 ## [69] boot_1.3-20 zip_2.0.4 ## [71] truncnorm_1.0-8 rlang_0.4.0 ## [73] pkgconfig_2.0.3 prabclus_2.3-1 ## [75] bitops_1.0-6 evaluate_0.14 ## [77] lattice_0.20-38 purrr_0.3.2 ## [79] labeling_0.3 cowplot_1.0.0 ## [81] tidyselect_0.2.5 GGally_1.4.0 ## [83] plyr_1.8.4 magrittr_1.5 ## [85] bookdown_0.13 R6_2.4.0 ## [87] pillar_1.4.2 haven_2.1.1 ## [89] foreign_0.8-70 withr_2.1.2 ## [91] survival_2.43-3 abind_1.4-5 ## [93] RCurl_1.95-4.12 sp_1.3-1 ## [95] nnet_7.3-12 tibble_2.1.3 ## [97] crayon_1.3.4 car_3.0-3 ## [99] xgboost_0.90.0.2 rmarkdown_1.15 ## [101] viridis_0.5.1 grid_3.6.0 ## [103] readxl_1.3.1 data.table_1.12.2 ## [105] forcats_0.4.0 diptest_0.75-7 ## [107] vcd_1.4-4 digest_0.6.21 ## [109] tidyr_1.0.0 munsell_0.5.0 ## [111] beeswarm_0.2.3 viridisLite_0.3.0 ## [113] vipor_0.4.5 6.5 Exercise: Expression QC (Reads) library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) reads &lt;- read.table(&quot;data/tung/reads.txt&quot;, sep = &quot;\\t&quot;) anno &lt;- read.table(&quot;data/tung/annotation.txt&quot;, sep = &quot;\\t&quot;, header = TRUE) head(reads[ , 1:3]) ## NA19098.r1.A01 NA19098.r1.A02 NA19098.r1.A03 ## ENSG00000237683 0 0 0 ## ENSG00000187634 0 0 0 
## ENSG00000188976 57 140 1 ## ENSG00000187961 0 0 0 ## ENSG00000187583 0 0 0 ## ENSG00000187642 0 0 0 head(anno) ## individual replicate well batch sample_id ## 1 NA19098 r1 A01 NA19098.r1 NA19098.r1.A01 ## 2 NA19098 r1 A02 NA19098.r1 NA19098.r1.A02 ## 3 NA19098 r1 A03 NA19098.r1 NA19098.r1.A03 ## 4 NA19098 r1 A04 NA19098.r1 NA19098.r1.A04 ## 5 NA19098 r1 A05 NA19098.r1 NA19098.r1.A05 ## 6 NA19098 r1 A06 NA19098.r1 NA19098.r1.A06 reads &lt;- SingleCellExperiment( assays = list(counts = as.matrix(reads)), colData = anno ) keep_feature &lt;- rowSums(counts(reads) &gt; 0) &gt; 0 reads &lt;- reads[keep_feature, ] isSpike(reads, &quot;ERCC&quot;) &lt;- grepl(&quot;^ERCC-&quot;, rownames(reads)) isSpike(reads, &quot;MT&quot;) &lt;- rownames(reads) %in% c(&quot;ENSG00000198899&quot;, &quot;ENSG00000198727&quot;, &quot;ENSG00000198888&quot;, &quot;ENSG00000198886&quot;, &quot;ENSG00000212907&quot;, &quot;ENSG00000198786&quot;, &quot;ENSG00000198695&quot;, &quot;ENSG00000198712&quot;, &quot;ENSG00000198804&quot;, &quot;ENSG00000198763&quot;, &quot;ENSG00000228253&quot;, &quot;ENSG00000198938&quot;, &quot;ENSG00000198840&quot;) reads &lt;- calculateQCMetrics( reads, feature_controls = list( ERCC = isSpike(reads, &quot;ERCC&quot;), MT = isSpike(reads, &quot;MT&quot;) ) ) ## Warning in calculateQCMetrics(reads, feature_controls = list(ERCC = ## isSpike(reads, : spike-in set &#39;ERCC&#39; overwritten by feature_controls set of ## the same name hist( reads$total_counts, breaks = 100 ) abline(v = 1.3e6, col = &quot;red&quot;) Figure 6.7: Histogram of library sizes for all cells filter_by_total_counts &lt;- (reads$total_counts &gt; 1.3e6) table(filter_by_total_counts) ## filter_by_total_counts ## FALSE TRUE ## 180 684 hist( reads$total_features_by_counts, breaks = 100 ) abline(v = 7000, col = &quot;red&quot;) Figure 6.8: Histogram of the number of detected genes in all cells filter_by_expr_features &lt;- (reads$total_features_by_counts &gt; 7000) table(filter_by_expr_features) ## filter_by_expr_features ## FALSE TRUE ## 116 748 plotColData( reads, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_MT&quot;, colour = &quot;batch&quot; ) Figure 6.9: Percentage of counts in MT genes plotColData( reads, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_ERCC&quot;, colour = &quot;batch&quot; ) Figure 6.10: Percentage of counts in ERCCs filter_by_ERCC &lt;- reads$batch != &quot;NA19098.r2&quot; &amp; reads$pct_counts_ERCC &lt; 25 table(filter_by_ERCC) ## filter_by_ERCC ## FALSE TRUE ## 103 761 filter_by_MT &lt;- reads$pct_counts_MT &lt; 30 table(filter_by_MT) ## filter_by_MT ## FALSE TRUE ## 18 846 reads$use &lt;- ( # sufficient features (genes) filter_by_expr_features &amp; # sufficient molecules counted filter_by_total_counts &amp; # sufficient endogenous RNA filter_by_ERCC &amp; # remove cells with unusual number of reads in MT genes filter_by_MT ) table(reads$use) ## ## FALSE TRUE ## 258 606 reads &lt;- runPCA( reads, use_coldata = TRUE, detect_outliers = TRUE ) reducedDimNames(reads) ## [1] &quot;PCA_coldata&quot; table(reads$outlier) ## ## FALSE TRUE ## 753 111 plotReducedDim( reads, use_dimred = &quot;PCA_coldata&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;use&quot;, colour_by = &quot;outlier&quot; ) library(limma) ## ## Attaching package: &#39;limma&#39; ## The following object is masked from &#39;package:scater&#39;: ## ## plotMDS ## The following object is masked from &#39;package:BiocGenerics&#39;: ## ## plotMA auto &lt;- 
colnames(reads)[reads$outlier] man &lt;- colnames(reads)[!reads$use] venn.diag &lt;- vennCounts( cbind(colnames(reads) %in% auto, colnames(reads) %in% man) ) vennDiagram( venn.diag, names = c(&quot;Automatic&quot;, &quot;Manual&quot;), circle.col = c(&quot;blue&quot;, &quot;green&quot;) ) Figure 6.11: Comparison of the default, automatic and manual cell filters plotHighestExprs(reads, exprs_values = &quot;counts&quot;) Figure 6.12: Number of total counts consumed by the top 50 expressed genes keep_feature &lt;- nexprs( reads[,colData(reads)$use], byrow = TRUE, detection_limit = 1 ) &gt;= 2 rowData(reads)$use &lt;- keep_feature table(keep_feature) ## keep_feature ## FALSE TRUE ## 2664 16062 dim(reads[rowData(reads)$use, colData(reads)$use]) ## [1] 16062 606 assay(reads, &quot;logcounts_raw&quot;) &lt;- log2(counts(reads) + 1) reducedDim(reads) &lt;- NULL saveRDS(reads, file = &quot;data/tung/reads.rds&quot;) By comparing Figure 6.5 and Figure 6.11, it is clear that the reads based filtering removed more cells than the UMI based analysis. If you go back and compare the results you should be able to conclude that the ERCC and MT filters are more strict for the reads-based analysis. sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] limma_3.40.6 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] ggbeeswarm_0.6.0 colorspace_1.4-1 ## [3] mvoutlier_2.0.9 class_7.3-15 ## [5] modeltools_0.2-22 rio_0.5.16 ## [7] mclust_5.4.5 XVector_0.24.0 ## [9] pls_2.7-1 BiocNeighbors_1.2.0 ## [11] cvTools_0.3.2 flexmix_2.3-15 ## [13] mvtnorm_1.0-11 ranger_0.11.2 ## [15] splines_3.6.0 sROC_0.1-2 ## [17] robustbase_0.93-5 knitr_1.25 ## [19] zeallot_0.1.0 robCompositions_2.1.0 ## [21] kernlab_0.9-27 cluster_2.1.0 ## [23] rrcov_1.4-7 compiler_3.6.0 ## [25] backports_1.1.4 assertthat_0.2.1 ## [27] Matrix_1.2-17 lazyeval_0.2.2 ## [29] BiocSingular_1.0.0 htmltools_0.3.6 ## [31] tools_3.6.0 rsvd_1.0.2 ## [33] gtable_0.3.0 glue_1.3.1 ## [35] GenomeInfoDbData_1.2.1 dplyr_0.8.3 ## [37] Rcpp_1.0.2 carData_3.0-2 ## [39] cellranger_1.1.0 zCompositions_1.3.2-1 ## [41] vctrs_0.2.0 sgeostat_1.0-27 ## [43] fpc_2.2-3 DelayedMatrixStats_1.6.1 ## [45] lmtest_0.9-37 xfun_0.9 ## [47] laeken_0.5.0 stringr_1.4.0 ## [49] openxlsx_4.1.0.1 lifecycle_0.1.0 ## [51] irlba_2.3.3 DEoptimR_1.0-8 ## [53] zlibbioc_1.30.0 MASS_7.3-51.1 ## [55] zoo_1.8-6 scales_1.0.0 ## [57] VIM_4.8.0 hms_0.5.1 ## [59] RColorBrewer_1.1-2 yaml_2.2.0 ## [61] curl_4.2 NADA_1.6-1 ## [63] gridExtra_2.3 reshape_0.8.8 ## [65] stringi_1.4.3 highr_0.8 ## [67] pcaPP_1.9-73 e1071_1.7-2 ## [69] boot_1.3-20 zip_2.0.4 ## [71] 
truncnorm_1.0-8 rlang_0.4.0 ## [73] pkgconfig_2.0.3 prabclus_2.3-1 ## [75] bitops_1.0-6 evaluate_0.14 ## [77] lattice_0.20-38 purrr_0.3.2 ## [79] labeling_0.3 cowplot_1.0.0 ## [81] tidyselect_0.2.5 GGally_1.4.0 ## [83] plyr_1.8.4 magrittr_1.5 ## [85] bookdown_0.13 R6_2.4.0 ## [87] pillar_1.4.2 haven_2.1.1 ## [89] foreign_0.8-70 withr_2.1.2 ## [91] survival_2.43-3 abind_1.4-5 ## [93] RCurl_1.95-4.12 sp_1.3-1 ## [95] nnet_7.3-12 tibble_2.1.3 ## [97] crayon_1.3.4 car_3.0-3 ## [99] rmarkdown_1.15 viridis_0.5.1 ## [101] grid_3.6.0 readxl_1.3.1 ## [103] data.table_1.12.2 forcats_0.4.0 ## [105] diptest_0.75-7 vcd_1.4-4 ## [107] digest_0.6.21 tidyr_1.0.0 ## [109] munsell_0.5.0 beeswarm_0.2.3 ## [111] viridisLite_0.3.0 vipor_0.4.5 6.6 Data visualization and exploratory data analysis 6.6.1 Introduction In this chapter we will continue to work with the filtered Tung dataset produced in the previous chapter. We will explore different ways of visualizing the data to allow you to assess what happened to the expression matrix after the quality control step. The scater package provides several very useful functions to simplify visualisation. One important aspect of single-cell RNA-seq is to control for batch effects. Batch effects are technical artefacts that are added to the samples during handling. For example, if two sets of samples were prepared in different labs or even on different days in the same lab, then we may observe greater similarities between the samples that were handled together. In the worst-case scenario, batch effects may be mistaken for true biological variation. Data visualisation can help to identify batch effects or other unwanted sources of variation that affect our observed gene expression measurements. The Tung data allows us to explore these issues in a controlled manner since some of the salient aspects of how the samples were handled have been recorded. Ideally, we expect to see batches from the same individual grouping together and distinct groups corresponding to each individual. Data visualisation and exploratory data analysis are invaluable for allowing us to get a “feel” for a dataset. This is an area of data analysis that is perhaps more art than science, but is a crucial aspect of single-cell QC and analysis. library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) umi &lt;- readRDS(&quot;data/tung/umi.rds&quot;) umi.qc &lt;- umi[rowData(umi)$use, colData(umi)$use] endog_genes &lt;- !rowData(umi.qc)$is_feature_control 6.6.2 PCA plot The easiest way to get an overview of the data is to transform it using principal component analysis and then visualise the first two principal components. Principal component analysis (PCA) is a statistical procedure that uses a transformation to convert a set of observations into a set of values of linearly uncorrelated variables called principal components (PCs). The number of principal components is less than or equal to the number of original variables. Mathematically, the PCs correspond to the eigenvectors of the covariance matrix. The eigenvectors are sorted by eigenvalue so that the first principal component accounts for as much of the variability in the data as possible, and each succeeding component in turn has the highest variance possible under the constraint that it is orthogonal to the preceding components (the figure below is taken from here).
Figure 6.13: Schematic representation of PCA dimensionality reduction 6.6.2.1 Before QC Without log-transformation: tmp &lt;- runPCA( umi[endog_genes, ], exprs_values = &quot;counts&quot; ) plotPCA( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.14: PCA plot of the tung data With log-transformation: tmp &lt;- runPCA( umi[endog_genes, ], exprs_values = &quot;logcounts_raw&quot; ) plotPCA( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.15: PCA plot of the tung data Clearly log-transformation is beneficial for our data: it reduces the variance on the first principal component and already separates some biological effects. Moreover, it makes the distribution of the expression values more normal. In the following analysis and chapters we will be using log-transformed raw counts by default. However, note that just a log-transformation is not enough to account for different technical factors between the cells (e.g. sequencing depth). Therefore, please do not use logcounts_raw for your downstream analysis; instead, as a minimum, use the logcounts slot of the SingleCellExperiment object, which is not just log-transformed, but also normalised by library size (e.g. CPM normalisation). In the course we use logcounts_raw only for demonstration purposes! 6.6.2.2 After QC tmp &lt;- runPCA( umi.qc[endog_genes, ], exprs_values = &quot;logcounts_raw&quot; ) plotPCA( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.16: PCA plot of the tung data Comparing Figure 6.15 and Figure 6.16, it is clear that after quality control the NA19098.r2 cells no longer form a group of outliers. By default only the top 500 most variable genes are used by scater to calculate the PCA. This can be adjusted by changing the ntop argument. Exercise 1 How do the PCA plots change when all 14,066 genes are used? Or when only the top 50 genes are used? Why does the fraction of variance accounted for by the first PC change so dramatically? Hint: use the ntop argument of the plotPCA function. Our answer Figure 6.17: PCA plot of the tung data (14214 genes) Figure 6.18: PCA plot of the tung data (50 genes) If your answers are different please compare your code with ours (you need to search for this exercise in the opened file). 6.6.3 tSNE map An alternative to PCA for visualizing scRNA-seq data is a tSNE plot. tSNE (t-Distributed Stochastic Neighbor Embedding) converts high-dimensional Euclidean distances between datapoints into conditional probabilities that represent similarities, to produce a low-dimensional representation of high-dimensional data that displays both large-scale and local structure in the dataset. Here, we map high-dimensional data (i.e. our 14,214-dimensional expression matrix) to a 2-dimensional space while preserving local distances between cells. tSNE is almost always used to produce a two-dimensional representation of a high-dimensional dataset; it is only rarely used to generate a reduced-dimension space with more than two dimensions, and is typically used only for visualisation, as opposed to being used as a general dimension-reduction method. Due to the non-linear and stochastic nature of the algorithm, tSNE is more difficult to intuitively interpret than a standard dimensionality reduction method such as PCA.
Things to be aware of when using tSNE: tSNE has a tendency to (visually) cluster points; as such, it often creates attractive plots of datasets with distinct cell types, but does not look as good when there are continuous changes in the cell population. The hyperparameters really matter: in particular, changing the perplexity parameter can have a large effect on the visualisation produced. Perplexity is a measure of information, but can loosely be thought of as a tuning parameter that controls the number of nearest neighbours for each datapoint. Cluster sizes in a tSNE plot mean nothing. Distances between clusters might not mean anything. Random noise doesn’t always look random. You can see some shapes, sometimes. For more details about how to use tSNE effectively, see this excellent article. In contrast with PCA, tSNE is a stochastic algorithm, which means running the method multiple times on the same dataset will result in different plots. To ensure reproducibility, we fix the “seed” of the random-number generator in the code below so that we always get the same plot. 6.6.3.1 Before QC set.seed(123456) tmp &lt;- runTSNE( umi[endog_genes, ], exprs_values = &quot;logcounts_raw&quot;, perplexity = 130 ) plotTSNE( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.19: tSNE map of the tung data 6.6.3.2 After QC set.seed(123456) tmp &lt;- runTSNE( umi.qc[endog_genes, ], exprs_values = &quot;logcounts_raw&quot;, perplexity = 130 ) plotTSNE( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.20: tSNE map of the tung data Interpreting PCA and tSNE plots is often challenging, and due to their stochastic and non-linear nature they are less intuitive. However, in this case it is clear that they provide a similar picture of the data. Comparing Figure 6.19 and 6.20, it is again clear that the samples from NA19098.r2 are no longer outliers after the QC filtering. Furthermore, tSNE requires you to provide a value of perplexity which reflects the number of neighbours used to build the nearest-neighbour network; a high value creates a dense network that clumps cells together, while a low value makes the network more sparse, allowing groups of cells to separate from each other. scater uses a default perplexity of the total number of cells divided by five (rounded down). You can read more about the pitfalls of using tSNE here. UMAP (Uniform Manifold Approximation and Projection) is a newer alternative to tSNE which also often creates attractive visualisations of scRNA-seq data, with the benefit of being faster than tSNE to compute, and is a “true” dimensionality reduction method. We will look at PCA, tSNE and UMAP plots in subsequent chapters and discuss the topic of dimensionality reduction further in the Latent spaces chapter. Exercise 2 How do the tSNE plots change when a perplexity of 10 or 200 is used? How does the choice of perplexity affect the interpretation of the results? Our answer (the calls used are sketched below, after the Big Exercise.) Figure 6.21: tSNE map of the tung data (perplexity = 10) Figure 6.22: tSNE map of the tung data (perplexity = 200) 6.6.4 Big Exercise Perform the same analysis with read counts of the Blischak data. Use tung/reads.rds file to load the reads SCE object. Once you have finished please compare your results to ours (next chapter).
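Returning to Exercise 2 above, a sketch of the calls behind Figures 6.21 and 6.22 — the same pattern as the perplexity-130 code; only the perplexity value changes:
# low perplexity: sparser neighbour network, groups separate more readily
set.seed(123456)
tmp &lt;- runTSNE( umi.qc[endog_genes, ], exprs_values = &quot;logcounts_raw&quot;, perplexity = 10 )
plotTSNE( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; )
# high perplexity: denser network, cells clump together
set.seed(123456)
tmp &lt;- runTSNE( umi.qc[endog_genes, ], exprs_values = &quot;logcounts_raw&quot;, perplexity = 200 )
plotTSNE( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; )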
6.6.5 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scater_1.12.2 ggplot2_3.2.1 ## [3] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [5] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [7] matrixStats_0.55.0 Biobase_2.44.0 ## [9] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 ## [11] IRanges_2.18.3 S4Vectors_0.22.1 ## [13] BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] Rcpp_1.0.2 rsvd_1.0.2 ## [3] lattice_0.20-38 assertthat_0.2.1 ## [5] digest_0.6.21 R6_2.4.0 ## [7] evaluate_0.14 highr_0.8 ## [9] pillar_1.4.2 zlibbioc_1.30.0 ## [11] rlang_0.4.0 lazyeval_0.2.2 ## [13] irlba_2.3.3 Matrix_1.2-17 ## [15] rmarkdown_1.15 BiocNeighbors_1.2.0 ## [17] labeling_0.3 Rtsne_0.15 ## [19] stringr_1.4.0 RCurl_1.95-4.12 ## [21] munsell_0.5.0 compiler_3.6.0 ## [23] vipor_0.4.5 BiocSingular_1.0.0 ## [25] xfun_0.9 pkgconfig_2.0.3 ## [27] ggbeeswarm_0.6.0 htmltools_0.3.6 ## [29] tidyselect_0.2.5 tibble_2.1.3 ## [31] gridExtra_2.3 GenomeInfoDbData_1.2.1 ## [33] bookdown_0.13 viridisLite_0.3.0 ## [35] crayon_1.3.4 dplyr_0.8.3 ## [37] withr_2.1.2 bitops_1.0-6 ## [39] grid_3.6.0 gtable_0.3.0 ## [41] magrittr_1.5 scales_1.0.0 ## [43] stringi_1.4.3 XVector_0.24.0 ## [45] viridis_0.5.1 DelayedMatrixStats_1.6.1 ## [47] cowplot_1.0.0 tools_3.6.0 ## [49] glue_1.3.1 beeswarm_0.2.3 ## [51] purrr_0.3.2 yaml_2.2.0 ## [53] colorspace_1.4-1 knitr_1.25 6.7 Exercise: Data visualization (Reads) library(scater) options(stringsAsFactors = FALSE) reads &lt;- readRDS(&quot;data/tung/reads.rds&quot;) reads.qc &lt;- reads[rowData(reads)$use, colData(reads)$use] endog_genes &lt;- !rowData(reads.qc)$is_feature_control tmp &lt;- runPCA( reads[endog_genes, ], exprs_values = &quot;counts&quot; ) plotPCA( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.23: PCA plot of the tung data tmp &lt;- runPCA( reads[endog_genes, ], exprs_values = &quot;logcounts_raw&quot; ) plotPCA( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.24: PCA plot of the tung data tmp &lt;- runPCA( reads.qc[endog_genes, ], exprs_values = &quot;logcounts_raw&quot; ) plotPCA( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.25: PCA plot of the tung data set.seed(123456) tmp &lt;- runTSNE( reads[endog_genes, ], exprs_values = &quot;logcounts_raw&quot;, perplexity = 130 ) plotTSNE( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.26: tSNE map of the tung data set.seed(123456) tmp &lt;- runTSNE( reads.qc[endog_genes, ], exprs_values = &quot;logcounts_raw&quot;, perplexity = 130 ) plotTSNE( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, 
shape_by = &quot;individual&quot; ) Figure 6.27: tSNE map of the tung data Figure 6.21: tSNE map of the tung data (perplexity = 10) Figure 6.22: tSNE map of the tung data (perplexity = 200) sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scater_1.12.2 ggplot2_3.2.1 ## [3] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [5] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [7] matrixStats_0.55.0 Biobase_2.44.0 ## [9] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 ## [11] IRanges_2.18.3 S4Vectors_0.22.1 ## [13] BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] Rcpp_1.0.2 rsvd_1.0.2 ## [3] lattice_0.20-38 assertthat_0.2.1 ## [5] digest_0.6.21 R6_2.4.0 ## [7] evaluate_0.14 highr_0.8 ## [9] pillar_1.4.2 zlibbioc_1.30.0 ## [11] rlang_0.4.0 lazyeval_0.2.2 ## [13] irlba_2.3.3 Matrix_1.2-17 ## [15] rmarkdown_1.15 BiocNeighbors_1.2.0 ## [17] labeling_0.3 Rtsne_0.15 ## [19] stringr_1.4.0 RCurl_1.95-4.12 ## [21] munsell_0.5.0 compiler_3.6.0 ## [23] vipor_0.4.5 BiocSingular_1.0.0 ## [25] xfun_0.9 pkgconfig_2.0.3 ## [27] ggbeeswarm_0.6.0 htmltools_0.3.6 ## [29] tidyselect_0.2.5 tibble_2.1.3 ## [31] gridExtra_2.3 GenomeInfoDbData_1.2.1 ## [33] bookdown_0.13 viridisLite_0.3.0 ## [35] crayon_1.3.4 dplyr_0.8.3 ## [37] withr_2.1.2 bitops_1.0-6 ## [39] grid_3.6.0 gtable_0.3.0 ## [41] magrittr_1.5 scales_1.0.0 ## [43] stringi_1.4.3 XVector_0.24.0 ## [45] viridis_0.5.1 DelayedMatrixStats_1.6.1 ## [47] cowplot_1.0.0 tools_3.6.0 ## [49] glue_1.3.1 beeswarm_0.2.3 ## [51] purrr_0.3.2 yaml_2.2.0 ## [53] colorspace_1.4-1 knitr_1.25 References "],
+["quality-control-and-data-visualisation.html", "6 Quality control and data visualisation 6.1 Expression QC overview (UMI) 6.2 Cell QC 6.3 Doublet detection 6.4 Gene QC 6.5 Exercise: Expression QC (Reads) 6.6 Data visualization and exploratory data analysis 6.7 Exercise: Data visualization (Reads)", " 6 Quality control and data visualisation The principle of garbage in, garbage out is at least as strong in single-cell genomics as it is elsewere in science. Effective quality control (QC) is crucial to high-quality scRNA-seq data analysis. We discuss principles and strategies for QC in this chapter, along with some discussion and demonstration of data visualisation approaches. 6.1 Expression QC overview (UMI) 6.1.1 Introduction Once gene expression has been quantified it is summarized as an expression matrix where each row corresponds to a gene (or transcript) and each column corresponds to a single cell. This matrix should be examined to remove poor quality cells which were not detected in either read QC or mapping QC steps. Failure to remove low quality cells at this stage may add technical noise which has the potential to obscure the biological signals of interest in the downstream analysis. Since there is currently no standard method for performing scRNASeq the expected values for the various QC measures that will be presented here can vary substantially from experiment to experiment. Thus, to perform QC we will be looking for cells which are outliers with respect to the rest of the dataset rather than comparing to independent quality standards. Consequently, care should be taken when comparing quality metrics across datasets collected using different protocols. 6.1.2 Tung dataset To illustrate cell QC, we consider a dataset of induced pluripotent stem cells generated from three different individuals (Tung et al. 2017) in Yoav Gilad’s lab at the University of Chicago. The experiments were carried out on the Fluidigm C1 platform and to facilitate the quantification both unique molecular identifiers (UMIs) and ERCC spike-ins were used. The data files are located in the tung folder in your working directory. These files are the copies of the original files made on the 15/03/16. We will use these copies for reproducibility purposes. library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) Load the data and annotations: molecules &lt;- read.table(&quot;data/tung/molecules.txt&quot;, sep = &quot;\\t&quot;) anno &lt;- read.table(&quot;data/tung/annotation.txt&quot;, sep = &quot;\\t&quot;, header = TRUE) Inspect a small portion of the expression matrix head(molecules[ , 1:3]) ## NA19098.r1.A01 NA19098.r1.A02 NA19098.r1.A03 ## ENSG00000237683 0 0 0 ## ENSG00000187634 0 0 0 ## ENSG00000188976 3 6 1 ## ENSG00000187961 0 0 0 ## ENSG00000187583 0 0 0 ## ENSG00000187642 0 0 0 head(anno) ## individual replicate well batch sample_id ## 1 NA19098 r1 A01 NA19098.r1 NA19098.r1.A01 ## 2 NA19098 r1 A02 NA19098.r1 NA19098.r1.A02 ## 3 NA19098 r1 A03 NA19098.r1 NA19098.r1.A03 ## 4 NA19098 r1 A04 NA19098.r1 NA19098.r1.A04 ## 5 NA19098 r1 A05 NA19098.r1 NA19098.r1.A05 ## 6 NA19098 r1 A06 NA19098.r1 NA19098.r1.A06 The data consists of 3 individuals and r length(unique(anno$replicate)) replicates and therefore has r length(unique(anno$batch)) batches in total. We standardize the analysis by using both SingleCellExperiment (SCE) and scater packages. 
First, create the SCE object: umi &lt;- SingleCellExperiment( assays = list(counts = as.matrix(molecules)), colData = anno ) Remove genes that are not expressed in any cell: keep_feature &lt;- rowSums(counts(umi) &gt; 0) &gt; 0 umi &lt;- umi[keep_feature, ] Define control features (genes) - ERCC spike-ins and mitochondrial genes (provided by the authors): isSpike(umi, &quot;ERCC&quot;) &lt;- grepl(&quot;^ERCC-&quot;, rownames(umi)) isSpike(umi, &quot;MT&quot;) &lt;- rownames(umi) %in% c(&quot;ENSG00000198899&quot;, &quot;ENSG00000198727&quot;, &quot;ENSG00000198888&quot;, &quot;ENSG00000198886&quot;, &quot;ENSG00000212907&quot;, &quot;ENSG00000198786&quot;, &quot;ENSG00000198695&quot;, &quot;ENSG00000198712&quot;, &quot;ENSG00000198804&quot;, &quot;ENSG00000198763&quot;, &quot;ENSG00000228253&quot;, &quot;ENSG00000198938&quot;, &quot;ENSG00000198840&quot;) Calculate the quality metrics: umi &lt;- calculateQCMetrics( umi, feature_controls = list( ERCC = isSpike(umi, &quot;ERCC&quot;), MT = isSpike(umi, &quot;MT&quot;) ) ) ## Warning in calculateQCMetrics(umi, feature_controls = list(ERCC = ## isSpike(umi, : spike-in set &#39;ERCC&#39; overwritten by feature_controls set of ## the same name 6.2 Cell QC 6.2.1 Library size Next we consider the total number of RNA molecules detected per sample (if we were using read counts rather than UMI counts this would be the total number of reads). Wells with few reads/molecules are likely to have been broken or failed to capture a cell, and should thus be removed. hist( umi$total_counts, breaks = 100 ) abline(v = 25000, col = &quot;red&quot;) Figure 6.1: Histogram of library sizes for all cells Exercise 1 How many cells does our filter remove? What distribution do you expect that the total number of molecules for each cell should follow? Our answer ## filter_by_total_counts ## FALSE TRUE ## 46 818 6.2.2 Detected genes In addition to ensuring sufficient sequencing depth for each sample, we also want to make sure that the reads are distributed across the transcriptome. Thus, we count the total number of unique genes detected in each sample. hist( umi$total_features_by_counts, breaks = 100 ) abline(v = 7000, col = &quot;red&quot;) Figure 6.2: Histogram of the number of detected genes in all cells From the plot we conclude that most cells have between 7,000 and 10,000 detected genes, which is normal for high-depth scRNA-seq. However, this varies by experimental protocol and sequencing depth. For example, droplet-based methods or samples with lower sequencing depth typically detect fewer genes per cell. The most notable feature in the above plot is the “heavy tail” on the left-hand side of the distribution. If detection rates were equal across the cells then the distribution should be approximately normal. Thus we remove those cells in the tail of the distribution (fewer than 7,000 detected genes). Exercise 2 How many cells does our filter remove? Our answer ## filter_by_expr_features ## FALSE TRUE ## 116 748 6.2.3 ERCCs and MTs Another measure of cell quality is the ratio between ERCC spike-in RNAs and endogenous RNAs. This ratio can be used to estimate the total amount of RNA in the captured cells. Cells with a high level of spike-in RNAs had low starting amounts of RNA, likely due to the cell being dead or stressed, which may result in the RNA being degraded.
plotColData( umi, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_MT&quot;, colour = &quot;batch&quot; ) Figure 6.3: Percentage of counts in MT genes plotColData( umi, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_ERCC&quot;, colour = &quot;batch&quot; ) Figure 6.4: Percentage of counts in ERCCs The above analysis shows that the majority of the cells from the NA19098.r2 batch have a very high ERCC/Endo ratio. Indeed, it has been shown by the authors that this batch contains cells of smaller size. Exercise 3 Create filters for removing batch NA19098.r2 and cells with high expression of mitochondrial genes (&gt;10% of total counts in a cell). Our answer ## filter_by_ERCC ## FALSE TRUE ## 96 768 ## filter_by_MT ## FALSE TRUE ## 31 833 Exercise 4 What would you expect to see in the ERCC vs counts plot if you were examining a dataset containing cells of different sizes (e.g. normal &amp; senescent cells)? Answer You would expect to see a group corresponding to the smaller cells (normal) with a higher fraction of ERCC reads than a separate group corresponding to the larger cells (senescent). 6.2.4 Cell filtering 6.2.4.1 Manual Now we can define a cell filter based on our previous analysis: umi$use &lt;- ( # sufficient features (genes) filter_by_expr_features &amp; # sufficient molecules counted filter_by_total_counts &amp; # sufficient endogenous RNA filter_by_ERCC &amp; # remove cells with unusual number of reads in MT genes filter_by_MT ) table(umi$use) ## ## FALSE TRUE ## 207 657 6.2.4.2 Automatic Another option available in scater is to conduct PCA on a set of QC metrics and then use automatic outlier detection to identify potentially problematic cells. By default, the following metrics are used for PCA-based outlier detection: pct_counts_top_100_features total_features pct_counts_feature_controls n_detected_feature_controls log10_counts_endogenous_features log10_counts_feature_controls scater first creates a matrix where the rows represent cells and the columns represent the different QC metrics. Outlier cells are then identified by running the mvoutlier package on this matrix of QC metrics for all cells. This will identify cells that have substantially different QC metrics from the others, possibly corresponding to low-quality cells. We can visualize any outliers using a principal components plot as shown below: umi &lt;- runPCA( umi, use_coldata = TRUE, detect_outliers = TRUE ) reducedDimNames(umi) ## [1] &quot;PCA_coldata&quot; Column subsetting can then be performed based on the $outlier slot, which indicates whether or not each cell has been designated as an outlier. Automatic outlier detection can be informative, but a close inspection of QC metrics and tailored filtering for the specifics of the dataset at hand is strongly recommended. table(umi$outlier) ## ## FALSE TRUE ## 791 73 Then, we can use a PCA plot to see a 2D representation of the cells ordered by their quality metrics. plotReducedDim( umi, use_dimred = &quot;PCA_coldata&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;use&quot;, colour_by = &quot;outlier&quot; ) 6.2.5 Compare filterings Exercise 5 Compare the default, automatic and manual cell filters. Plot a Venn diagram of the outlier cells from these filterings. Hint: Use vennCounts and vennDiagram functions from the limma package to make a Venn diagram.
Answer library(limma) auto &lt;- colnames(umi)[umi$outlier] man &lt;- colnames(umi)[!umi$use] venn.diag &lt;- vennCounts( cbind(colnames(umi) %in% auto, colnames(umi) %in% man) ) vennDiagram( venn.diag, names = c(&quot;Automatic&quot;, &quot;Manual&quot;), circle.col = c(&quot;blue&quot;, &quot;green&quot;) ) Figure 6.5: Comparison of the default, automatic and manual cell filters 6.3 Doublet detection For droplet-based datasets, there is a chance that multiple cells are enclosed in one droplet, so that one cell barcode actually contains read information from multiple cells. One way to find doublets/multiplets in the data is to see if there are cells co-expressing markers of distinct cell types. There are also computational tools available for detecting potential doublets. Many of these tools rely on artificial doublets formed from the dataset by randomly joining the expression profiles of two cells; the cells are then tested against the artificial doublet profiles. We demonstrate the usage of two of these doublet detection tools. 6.3.1 scds scds (Bais and Kostka 2019) has two detection methods: co-expression based and binary-classification based. In the co-expression based approach, gene-pair co-expression probabilities are estimated using a binomial model, and gene pairs that are not normally co-expressed contribute higher scores when they are co-expressed in a given cell. Each cell’s doublet score is then derived from the co-expression of such gene pairs. In the binary classification based approach, artificial doublets are generated, and cells that are difficult to separate from the artificial doublets get higher doublet scores. library(scds) #- Annotate doublet using co-expression based doublet scoring: umi = cxds(umi) #- Annotate doublet using binary classification based doublet scoring: umi = bcds(umi) ## [1] train-error:0.066402+0.005931 test-error:0.103638+0.025177 ## Multiple eval metrics are present. Will use test_error for early stopping. ## Will train until test_error hasn&#39;t improved in 2 rounds. ## ## [2] train-error:0.049912+0.002718 test-error:0.085687+0.015561 ## [3] train-error:0.036747+0.002381 test-error:0.079870+0.015979 ## [4] train-error:0.029948+0.002910 test-error:0.071214+0.015731 ## [5] train-error:0.026042+0.003332 test-error:0.072934+0.012966 ## [6] train-error:0.022424+0.002773 test-error:0.064263+0.011326 ## [7] train-error:0.019385+0.003075 test-error:0.058461+0.012930 ## [8] train-error:0.018084+0.001510 test-error:0.056722+0.009848 ## [9] train-error:0.016203+0.002440 test-error:0.052094+0.012606 ## [10] train-error:0.013310+0.003702 test-error:0.049775+0.013542 ## [11] train-error:0.011574+0.002856 test-error:0.046891+0.012536 ## [12] train-error:0.010705+0.001959 test-error:0.046302+0.011468 ## [13] train-error:0.009693+0.001081 test-error:0.043413+0.012598 ## [14] train-error:0.008826+0.000545 test-error:0.042254+0.011269 ## [15] train-error:0.007813+0.000709 test-error:0.040515+0.011465 ## [16] train-error:0.006655+0.000705 test-error:0.042249+0.011692 ## [17] train-error:0.005498+0.000584 test-error:0.042254+0.012804 ## Stopping. Best iteration: ## [15] train-error:0.007813+0.000709 test-error:0.040515+0.011465 ## ## [1] train-error:0.068866 ## Will train until train_error hasn&#39;t improved in 2 rounds. 
## ## [2] train-error:0.038194 ## [3] train-error:0.034144 ## [4] train-error:0.026620 ## [5] train-error:0.024884 ## [6] train-error:0.020255 ## [7] train-error:0.020255 ## [8] train-error:0.019097 ## [9] train-error:0.016782 ## [10] train-error:0.013310 #- Combine both annotations into a hybrid annotation umi = cxds_bcds_hybrid(umi) #- Doublet scores are now available via colData: CD = colData(umi) head(cbind(CD$cxds_score,CD$bcds_score, CD$hybrid_score)) ## [,1] [,2] [,3] ## NA19098.r1.A01 4131.405 0.012810320 0.2455279 ## NA19098.r1.A02 4564.089 0.004930826 0.2628488 ## NA19098.r1.A03 2827.904 0.020504847 0.1769445 ## NA19098.r1.A04 4708.213 0.009793874 0.2762736 ## NA19098.r1.A05 6134.590 0.006487557 0.3565501 ## NA19098.r1.A06 5810.730 0.008279418 0.3393878 plotColData( umi, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_ERCC&quot;, colour = &quot;hybrid_score&quot; ) The scds paper features excellent descriptions and evaluations of other currently-available doublet detection methods. 6.3.2 DoubletDetection DoubletDetection is a Python module that runs on raw UMI count data. It generates artificial doublets and then performs cell clustering on the augmented dataset. Cells that cluster closely with the artificial doublets across multiple iterations are predicted to be doublets. We provide a Python script for running DoubletDetection on the Tung dataset at ./mig_2019_scrnaseq-workshop/course_files/utils/run_doubletDetection.py python run_doubletDetection.py Here are the prediction results from DoubletDetection: require(UpSetR) ## Loading required package: UpSetR pred_tung &lt;- read.delim(file = &quot;data/doublets/tung.dbls.txt&quot;, header = FALSE) dim(pred_tung) ## [1] 864 1 dim(anno) ## [1] 864 5 umi$dbd_dbl &lt;- factor(pred_tung$V1) qc_label &lt;- read.delim(file = &quot;data/qc_ipsc.txt&quot;) head(qc_label) ## individual replicate well cell_number concentration tra1.60 ## 1 NA19098 r1 A01 1 1.734785 1 ## 2 NA19098 r1 A02 1 1.723038 1 ## 3 NA19098 r1 A03 1 1.512786 1 ## 4 NA19098 r1 A04 1 1.347492 1 ## 5 NA19098 r1 A05 1 2.313047 1 ## 6 NA19098 r1 A06 1 2.056803 1 qc_label$sample_id &lt;- paste0(qc_label$individual,&quot;.&quot;,qc_label$replicate,&quot;.&quot;,qc_label$well) rownames(qc_label) &lt;- qc_label$sample_id umi$cell_number &lt;- as.character(qc_label[umi$sample_id,&quot;cell_number&quot;]) umi$cell_number[qc_label$cell_number==0] &lt;- &quot;no_cell&quot; umi$cell_number[qc_label$cell_number == 1] &lt;- &quot;single_cell&quot; umi$cell_number[qc_label$cell_number&gt;1] &lt;- &quot;multi_cell&quot; multiplot(plotColData( umi, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_ERCC&quot;, colour = &quot;hybrid_score&quot; ), plotColData( umi, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_ERCC&quot;, colour = &quot;dbd_dbl&quot; ), plotColData( umi, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_ERCC&quot;, colour = &quot;cell_number&quot; ),cols =2) doublets &lt;- unique(umi$sample_id[umi$dbd_dbl ==&quot;1&quot;], umi$sample_id[umi$hybrid_score &gt; 0.8]) pl_list &lt;- UpSetR::fromList(list(pred = doublets,qc_label = qc_label$sample_id[qc_label$cell_number &gt;1])) UpSetR::upset(pl_list,sets = c(&quot;pred&quot;,&quot;qc_label&quot;)) 6.3.2.1 Other tools available: DoubletFinder DoubletCells as part of SimpleSingleCell Scrublet 6.4 Gene QC 6.4.1 Gene expression In addition to removing poor-quality cells, it is usually a good idea to exclude genes where we suspect that technical artefacts may have skewed the 
results. Moreover, inspection of the gene expression profiles may provide insights about how the experimental procedures could be improved. It is often instructive to consider the number of reads consumed by the top 50 expressed genes. plotHighestExprs(umi, exprs_values = &quot;counts&quot;) Figure 6.6: Number of total counts consumed by the top 50 expressed genes The distributions are relatively flat, indicating (but not guaranteeing!) good coverage of the full transcriptome of these cells. However, there are several spike-ins in the top 15 genes, which suggests that a greater dilution of the spike-ins may be preferable if the experiment is to be repeated. 6.4.2 Gene filtering It is typically a good idea to remove genes whose expression level is considered “undetectable”. We define a gene as detectable if at least two cells contain more than 1 transcript from the gene. If we were considering read counts rather than UMI counts, a reasonable threshold is to require at least five reads in at least two cells. However, in both cases the threshold strongly depends on the sequencing depth. It is important to keep in mind that genes must be filtered after cell filtering since some genes may only be detected in poor-quality cells (note the colData(umi)$use filter applied to the umi dataset). keep_feature &lt;- nexprs( umi[,colData(umi)$use], byrow = TRUE, detection_limit = 1 ) &gt;= 2 rowData(umi)$use &lt;- keep_feature table(keep_feature) ## keep_feature ## FALSE TRUE ## 4660 14066 Depending on the cell type, protocol and sequencing depth, other cut-offs may be appropriate. 6.4.3 Save the data Dimensions of the QCed dataset (do not forget about the gene filter we defined above): dim(umi[rowData(umi)$use, colData(umi)$use]) ## [1] 14066 657 Let’s create an additional slot with log-transformed counts (we will need it in the next chapters) and remove saved PCA results from the reducedDim slot: assay(umi, &quot;logcounts_raw&quot;) &lt;- log2(counts(umi) + 1) reducedDim(umi) &lt;- NULL Save the data: saveRDS(umi, file = &quot;data/tung/umi.rds&quot;) 6.4.4 Big Exercise Perform exactly the same QC analysis with read counts of the same Blischak data. Use the tung/reads.txt file to load the reads. Once you have finished, please compare your results to ours (next chapter). 
6.4.5 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] UpSetR_1.4.0 scds_1.0.0 ## [3] limma_3.40.6 scater_1.12.2 ## [5] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [7] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [9] BiocParallel_1.18.1 matrixStats_0.55.0 ## [11] Biobase_2.44.0 GenomicRanges_1.36.1 ## [13] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [15] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] ggbeeswarm_0.6.0 colorspace_1.4-1 ## [3] mvoutlier_2.0.9 class_7.3-15 ## [5] modeltools_0.2-22 rio_0.5.16 ## [7] mclust_5.4.5 XVector_0.24.0 ## [9] pls_2.7-1 BiocNeighbors_1.2.0 ## [11] cvTools_0.3.2 flexmix_2.3-15 ## [13] mvtnorm_1.0-11 ranger_0.11.2 ## [15] splines_3.6.0 sROC_0.1-2 ## [17] robustbase_0.93-5 knitr_1.25 ## [19] zeallot_0.1.0 robCompositions_2.1.0 ## [21] kernlab_0.9-27 cluster_2.1.0 ## [23] rrcov_1.4-7 compiler_3.6.0 ## [25] backports_1.1.4 assertthat_0.2.1 ## [27] Matrix_1.2-17 lazyeval_0.2.2 ## [29] BiocSingular_1.0.0 htmltools_0.3.6 ## [31] tools_3.6.0 rsvd_1.0.2 ## [33] gtable_0.3.0 glue_1.3.1 ## [35] GenomeInfoDbData_1.2.1 dplyr_0.8.3 ## [37] Rcpp_1.0.2 carData_3.0-2 ## [39] cellranger_1.1.0 zCompositions_1.3.2-1 ## [41] vctrs_0.2.0 sgeostat_1.0-27 ## [43] fpc_2.2-3 DelayedMatrixStats_1.6.1 ## [45] lmtest_0.9-37 xfun_0.9 ## [47] laeken_0.5.0 stringr_1.4.0 ## [49] openxlsx_4.1.0.1 lifecycle_0.1.0 ## [51] irlba_2.3.3 DEoptimR_1.0-8 ## [53] zlibbioc_1.30.0 MASS_7.3-51.1 ## [55] zoo_1.8-6 scales_1.0.0 ## [57] VIM_4.8.0 hms_0.5.1 ## [59] RColorBrewer_1.1-2 yaml_2.2.0 ## [61] curl_4.2 NADA_1.6-1 ## [63] gridExtra_2.3 reshape_0.8.8 ## [65] stringi_1.4.3 highr_0.8 ## [67] pcaPP_1.9-73 e1071_1.7-2 ## [69] boot_1.3-20 zip_2.0.4 ## [71] truncnorm_1.0-8 rlang_0.4.0 ## [73] pkgconfig_2.0.3 prabclus_2.3-1 ## [75] bitops_1.0-6 evaluate_0.14 ## [77] lattice_0.20-38 purrr_0.3.2 ## [79] labeling_0.3 cowplot_1.0.0 ## [81] tidyselect_0.2.5 GGally_1.4.0 ## [83] plyr_1.8.4 magrittr_1.5 ## [85] bookdown_0.13 R6_2.4.0 ## [87] pillar_1.4.2 haven_2.1.1 ## [89] foreign_0.8-70 withr_2.1.2 ## [91] survival_2.43-3 abind_1.4-5 ## [93] RCurl_1.95-4.12 sp_1.3-1 ## [95] nnet_7.3-12 tibble_2.1.3 ## [97] crayon_1.3.4 car_3.0-3 ## [99] xgboost_0.90.0.2 rmarkdown_1.15 ## [101] viridis_0.5.1 grid_3.6.0 ## [103] readxl_1.3.1 data.table_1.12.2 ## [105] forcats_0.4.0 diptest_0.75-7 ## [107] vcd_1.4-4 digest_0.6.21 ## [109] tidyr_1.0.0 munsell_0.5.0 ## [111] beeswarm_0.2.3 viridisLite_0.3.0 ## [113] vipor_0.4.5 6.5 Exercise: Expression QC (Reads) library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) reads &lt;- read.table(&quot;data/tung/reads.txt&quot;, sep = &quot;\\t&quot;) anno &lt;- read.table(&quot;data/tung/annotation.txt&quot;, sep = &quot;\\t&quot;, header = TRUE) head(reads[ , 1:3]) ## NA19098.r1.A01 NA19098.r1.A02 NA19098.r1.A03 ## ENSG00000237683 0 0 0 ## ENSG00000187634 0 0 0 
## ENSG00000188976 57 140 1 ## ENSG00000187961 0 0 0 ## ENSG00000187583 0 0 0 ## ENSG00000187642 0 0 0 head(anno) ## individual replicate well batch sample_id ## 1 NA19098 r1 A01 NA19098.r1 NA19098.r1.A01 ## 2 NA19098 r1 A02 NA19098.r1 NA19098.r1.A02 ## 3 NA19098 r1 A03 NA19098.r1 NA19098.r1.A03 ## 4 NA19098 r1 A04 NA19098.r1 NA19098.r1.A04 ## 5 NA19098 r1 A05 NA19098.r1 NA19098.r1.A05 ## 6 NA19098 r1 A06 NA19098.r1 NA19098.r1.A06 reads &lt;- SingleCellExperiment( assays = list(counts = as.matrix(reads)), colData = anno ) keep_feature &lt;- rowSums(counts(reads) &gt; 0) &gt; 0 reads &lt;- reads[keep_feature, ] isSpike(reads, &quot;ERCC&quot;) &lt;- grepl(&quot;^ERCC-&quot;, rownames(reads)) isSpike(reads, &quot;MT&quot;) &lt;- rownames(reads) %in% c(&quot;ENSG00000198899&quot;, &quot;ENSG00000198727&quot;, &quot;ENSG00000198888&quot;, &quot;ENSG00000198886&quot;, &quot;ENSG00000212907&quot;, &quot;ENSG00000198786&quot;, &quot;ENSG00000198695&quot;, &quot;ENSG00000198712&quot;, &quot;ENSG00000198804&quot;, &quot;ENSG00000198763&quot;, &quot;ENSG00000228253&quot;, &quot;ENSG00000198938&quot;, &quot;ENSG00000198840&quot;) reads &lt;- calculateQCMetrics( reads, feature_controls = list( ERCC = isSpike(reads, &quot;ERCC&quot;), MT = isSpike(reads, &quot;MT&quot;) ) ) ## Warning in calculateQCMetrics(reads, feature_controls = list(ERCC = ## isSpike(reads, : spike-in set &#39;ERCC&#39; overwritten by feature_controls set of ## the same name hist( reads$total_counts, breaks = 100 ) abline(v = 1.3e6, col = &quot;red&quot;) Figure 6.7: Histogram of library sizes for all cells filter_by_total_counts &lt;- (reads$total_counts &gt; 1.3e6) table(filter_by_total_counts) ## filter_by_total_counts ## FALSE TRUE ## 180 684 hist( reads$total_features_by_counts, breaks = 100 ) abline(v = 7000, col = &quot;red&quot;) Figure 6.8: Histogram of the number of detected genes in all cells filter_by_expr_features &lt;- (reads$total_features_by_counts &gt; 7000) table(filter_by_expr_features) ## filter_by_expr_features ## FALSE TRUE ## 116 748 plotColData( reads, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_MT&quot;, colour = &quot;batch&quot; ) Figure 6.9: Percentage of counts in MT genes plotColData( reads, x = &quot;total_features_by_counts&quot;, y = &quot;pct_counts_ERCC&quot;, colour = &quot;batch&quot; ) Figure 6.10: Percentage of counts in ERCCs filter_by_ERCC &lt;- reads$batch != &quot;NA19098.r2&quot; &amp; reads$pct_counts_ERCC &lt; 25 table(filter_by_ERCC) ## filter_by_ERCC ## FALSE TRUE ## 103 761 filter_by_MT &lt;- reads$pct_counts_MT &lt; 30 table(filter_by_MT) ## filter_by_MT ## FALSE TRUE ## 18 846 reads$use &lt;- ( # sufficient features (genes) filter_by_expr_features &amp; # sufficient molecules counted filter_by_total_counts &amp; # sufficient endogenous RNA filter_by_ERCC &amp; # remove cells with unusual number of reads in MT genes filter_by_MT ) table(reads$use) ## ## FALSE TRUE ## 258 606 reads &lt;- runPCA( reads, use_coldata = TRUE, detect_outliers = TRUE ) reducedDimNames(reads) ## [1] &quot;PCA_coldata&quot; table(reads$outlier) ## ## FALSE TRUE ## 753 111 plotReducedDim( reads, use_dimred = &quot;PCA_coldata&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;use&quot;, colour_by = &quot;outlier&quot; ) library(limma) ## ## Attaching package: &#39;limma&#39; ## The following object is masked from &#39;package:scater&#39;: ## ## plotMDS ## The following object is masked from &#39;package:BiocGenerics&#39;: ## ## plotMA auto &lt;- 
colnames(reads)[reads$outlier] man &lt;- colnames(reads)[!reads$use] venn.diag &lt;- vennCounts( cbind(colnames(reads) %in% auto, colnames(reads) %in% man) ) vennDiagram( venn.diag, names = c(&quot;Automatic&quot;, &quot;Manual&quot;), circle.col = c(&quot;blue&quot;, &quot;green&quot;) ) Figure 6.11: Comparison of the default, automatic and manual cell filters plotHighestExprs(reads, exprs_values = &quot;counts&quot;) Figure 6.12: Number of total counts consumed by the top 50 expressed genes keep_feature &lt;- nexprs( reads[,colData(reads)$use], byrow = TRUE, detection_limit = 1 ) &gt;= 2 rowData(reads)$use &lt;- keep_feature table(keep_feature) ## keep_feature ## FALSE TRUE ## 2664 16062 dim(reads[rowData(reads)$use, colData(reads)$use]) ## [1] 16062 606 assay(reads, &quot;logcounts_raw&quot;) &lt;- log2(counts(reads) + 1) reducedDim(reads) &lt;- NULL saveRDS(reads, file = &quot;data/tung/reads.rds&quot;) By comparing Figure 6.5 and Figure 6.11, it is clear that the reads based filtering removed more cells than the UMI based analysis. If you go back and compare the results you should be able to conclude that the ERCC and MT filters are more strict for the reads-based analysis. sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] limma_3.40.6 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] ggbeeswarm_0.6.0 colorspace_1.4-1 ## [3] mvoutlier_2.0.9 class_7.3-15 ## [5] modeltools_0.2-22 rio_0.5.16 ## [7] mclust_5.4.5 XVector_0.24.0 ## [9] pls_2.7-1 BiocNeighbors_1.2.0 ## [11] cvTools_0.3.2 flexmix_2.3-15 ## [13] mvtnorm_1.0-11 ranger_0.11.2 ## [15] splines_3.6.0 sROC_0.1-2 ## [17] robustbase_0.93-5 knitr_1.25 ## [19] zeallot_0.1.0 robCompositions_2.1.0 ## [21] kernlab_0.9-27 cluster_2.1.0 ## [23] rrcov_1.4-7 compiler_3.6.0 ## [25] backports_1.1.4 assertthat_0.2.1 ## [27] Matrix_1.2-17 lazyeval_0.2.2 ## [29] BiocSingular_1.0.0 htmltools_0.3.6 ## [31] tools_3.6.0 rsvd_1.0.2 ## [33] gtable_0.3.0 glue_1.3.1 ## [35] GenomeInfoDbData_1.2.1 dplyr_0.8.3 ## [37] Rcpp_1.0.2 carData_3.0-2 ## [39] cellranger_1.1.0 zCompositions_1.3.2-1 ## [41] vctrs_0.2.0 sgeostat_1.0-27 ## [43] fpc_2.2-3 DelayedMatrixStats_1.6.1 ## [45] lmtest_0.9-37 xfun_0.9 ## [47] laeken_0.5.0 stringr_1.4.0 ## [49] openxlsx_4.1.0.1 lifecycle_0.1.0 ## [51] irlba_2.3.3 DEoptimR_1.0-8 ## [53] zlibbioc_1.30.0 MASS_7.3-51.1 ## [55] zoo_1.8-6 scales_1.0.0 ## [57] VIM_4.8.0 hms_0.5.1 ## [59] RColorBrewer_1.1-2 yaml_2.2.0 ## [61] curl_4.2 NADA_1.6-1 ## [63] gridExtra_2.3 reshape_0.8.8 ## [65] stringi_1.4.3 highr_0.8 ## [67] pcaPP_1.9-73 e1071_1.7-2 ## [69] boot_1.3-20 zip_2.0.4 ## [71] 
truncnorm_1.0-8 rlang_0.4.0 ## [73] pkgconfig_2.0.3 prabclus_2.3-1 ## [75] bitops_1.0-6 evaluate_0.14 ## [77] lattice_0.20-38 purrr_0.3.2 ## [79] labeling_0.3 cowplot_1.0.0 ## [81] tidyselect_0.2.5 GGally_1.4.0 ## [83] plyr_1.8.4 magrittr_1.5 ## [85] bookdown_0.13 R6_2.4.0 ## [87] pillar_1.4.2 haven_2.1.1 ## [89] foreign_0.8-70 withr_2.1.2 ## [91] survival_2.43-3 abind_1.4-5 ## [93] RCurl_1.95-4.12 sp_1.3-1 ## [95] nnet_7.3-12 tibble_2.1.3 ## [97] crayon_1.3.4 car_3.0-3 ## [99] rmarkdown_1.15 viridis_0.5.1 ## [101] grid_3.6.0 readxl_1.3.1 ## [103] data.table_1.12.2 forcats_0.4.0 ## [105] diptest_0.75-7 vcd_1.4-4 ## [107] digest_0.6.21 tidyr_1.0.0 ## [109] munsell_0.5.0 beeswarm_0.2.3 ## [111] viridisLite_0.3.0 vipor_0.4.5 6.6 Data visualization and exploratory data analysis 6.6.1 Introduction In this chapter we will continue to work with the filtered Tung dataset produced in the previous chapter. We will explore different ways of visualizing the data to allow you to assess what happened to the expression matrix after the quality control step. The scater package provides several very useful functions to simplify visualisation. One important aspect of single-cell RNA-seq is to control for batch effects. Batch effects are technical artefacts that are added to the samples during handling. For example, if two sets of samples were prepared in different labs or even on different days in the same lab, then we may observe greater similarities between the samples that were handled together. In the worst-case scenario, batch effects may be mistaken for true biological variation. Data visualisation can help to identify batch effects or other unwanted sources of variation that affect our observed gene expression measurements. The Tung data allows us to explore these issues in a controlled manner since some of the salient aspects of how the samples were handled have been recorded. Ideally, we expect to see batches from the same individual grouping together and distinct groups corresponding to each individual. Data visualisation and exploratory data analysis are invaluable for allowing us to get a “feel” for a dataset. This is an area of data analysis that is perhaps more art than science, but is a crucial aspect of single-cell QC and analysis. library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) umi &lt;- readRDS(&quot;data/tung/umi.rds&quot;) umi.qc &lt;- umi[rowData(umi)$use, colData(umi)$use] endog_genes &lt;- !rowData(umi.qc)$is_feature_control 6.6.2 PCA plot The easiest way to get an overview of the data is to transform it using principal component analysis and then visualize the first two principal components. Principal component analysis (PCA) is a statistical procedure that uses a transformation to convert a set of observations into a set of values of linearly uncorrelated variables called principal components (PCs). The number of principal components is less than or equal to the number of original variables. Mathematically, the PCs correspond to the eigenvectors of the covariance matrix. The eigenvectors are sorted by eigenvalue so that the first principal component accounts for as much of the variability in the data as possible, and each succeeding component in turn has the highest variance possible under the constraint that it is orthogonal to the preceding components (the figure below is taken from here). 
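To make the linear algebra above concrete, here is a minimal sketch (on a small simulated matrix, not the Tung data) showing that the PCs are simply the eigenvectors of the covariance matrix, sorted by eigenvalue:
set.seed(1)
x &lt;- matrix(rnorm(200), ncol = 2) %*% matrix(c(3, 1, 1, 1), ncol = 2)  # toy correlated data: 100 observations, 2 variables
e &lt;- eigen(cov(x))                            # eigenvectors = principal axes, eigenvalues = their variances
pcs &lt;- scale(x, scale = FALSE) %*% e$vectors  # project the centred data onto the PCs
e$values / sum(e$values)                      # fraction of variance accounted for by each PC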
Figure 6.13: Schematic representation of PCA dimensionality reduction 6.6.2.1 Before QC Without log-transformation: tmp &lt;- runPCA( umi[endog_genes, ], exprs_values = &quot;counts&quot; ) plotPCA( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.14: PCA plot of the tung data With log-transformation: tmp &lt;- runPCA( umi[endog_genes, ], exprs_values = &quot;logcounts_raw&quot; ) plotPCA( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.15: PCA plot of the tung data Clearly, log-transformation is beneficial for our data: it reduces the variance on the first principal component and already separates some biological effects. Moreover, it makes the distribution of the expression values more normal. In the following analysis and chapters we will be using log-transformed raw counts by default. However, note that a log-transformation alone is not enough to account for different technical factors between the cells (e.g. sequencing depth). Therefore, please do not use logcounts_raw for your downstream analysis; instead, as a minimum, use the logcounts slot of the SingleCellExperiment object, which is not just log-transformed, but also normalised by library size (e.g. CPM normalisation). In the course we use logcounts_raw only for demonstration purposes! 6.6.2.2 After QC tmp &lt;- runPCA( umi.qc[endog_genes, ], exprs_values = &quot;logcounts_raw&quot; ) plotPCA( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.16: PCA plot of the tung data Comparing Figure 6.15 and Figure 6.16, it is clear that after quality control the NA19098.r2 cells no longer form a group of outliers. By default only the top 500 most variable genes are used by scater to calculate the PCA. This can be adjusted by changing the ntop argument. Exercise 1 How do the PCA plots change when all 14,066 genes are used? Or when only the top 50 genes are used? Why does the fraction of variance accounted for by the first PC change so dramatically? Hint Use the ntop argument of the plotPCA function. Our answer Figure 6.17: PCA plot of the tung data (14214 genes) Figure 6.18: PCA plot of the tung data (50 genes) If your answers are different please compare your code with ours (you need to search for this exercise in the opened file). 6.6.3 tSNE map An alternative to PCA for visualizing scRNA-seq data is a tSNE plot. tSNE (t-Distributed Stochastic Neighbor Embedding) converts high-dimensional Euclidean distances between datapoints into conditional probabilities that represent similarities, to produce a low-dimensional representation of high-dimensional data that preserves both large-scale and local structure in the dataset. Here, we map high-dimensional data (i.e. our 14,214-dimensional expression matrix) to a 2-dimensional space while preserving local distances between cells. tSNE is almost always used to produce a two-dimensional representation of a high-dimensional dataset; it is only rarely used to generate a reduced-dimension space with more than two dimensions and is typically used only for visualisation, as opposed to being used as a general dimension-reduction method. Due to the non-linear and stochastic nature of the algorithm, tSNE is more difficult to intuitively interpret than a standard dimensionality reduction method such as PCA. 
Things to be aware of when using tSNE: tSNE has a tendency to (visually) cluster points; as such, it often creates attractive plots of datasets with distinct cell types, but does not look as good when there are continuous changes in the cell population. The hyperparameters really matter: in particular, changing the perplexity parameter can have a large effect on the visualisation produced. Perplexity is a measure of information, but can loosely be thought of as a tuning parameter that controls the number of nearest neighbours for each datapoint. Cluster sizes in a tSNE plot mean nothing. Distances between clusters might not mean anything. Random noise doesn’t always look random. You can see some shapes, sometimes. For more details about how to use tSNE effectively, see this excellent article. In contrast with PCA, tSNE is a stochastic algorithm, which means running the method multiple times on the same dataset will result in different plots. To ensure reproducibility, we fix the “seed” of the random-number generator in the code below so that we always get the same plot. 6.6.3.1 Before QC set.seed(123456) tmp &lt;- runTSNE( umi[endog_genes, ], exprs_values = &quot;logcounts_raw&quot;, perplexity = 130 ) plotTSNE( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.19: tSNE map of the tung data 6.6.3.2 After QC set.seed(123456) tmp &lt;- runTSNE( umi.qc[endog_genes, ], exprs_values = &quot;logcounts_raw&quot;, perplexity = 130 ) plotTSNE( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.20: tSNE map of the tung data Interpreting PCA and tSNE plots is often challenging; due to its stochastic and non-linear nature, tSNE is the less intuitive of the two. However, in this case it is clear that they provide a similar picture of the data. Comparing Figure 6.19 and 6.20, it is again clear that the samples from NA19098.r2 are no longer outliers after the QC filtering. Furthermore, tSNE requires you to provide a value of perplexity which reflects the number of neighbours used to build the nearest-neighbour network; a high value creates a dense network, which clumps cells together, while a low value makes the network sparser, allowing groups of cells to separate from each other. scater uses a default perplexity of the total number of cells divided by five (rounded down). You can read more about the pitfalls of using tSNE here. UMAP (Uniform Manifold Approximation and Projection) is a newer alternative to tSNE which also often creates attractive visualisations of scRNA-seq data, with the benefit of being faster than tSNE to compute, and it is a “true” dimensionality reduction method. We will look at PCA, tSNE and UMAP plots in subsequent chapters and discuss the topic of dimensionality reduction further in the Latent spaces chapter. Exercise 2 How do the tSNE plots change when a perplexity of 10 or 200 is used? How does the choice of perplexity affect the interpretation of the results? Our answer Figure 6.21: tSNE map of the tung data (perplexity = 10) Figure 6.22: tSNE map of the tung data (perplexity = 200) (see the code sketch below) 6.6.4 Big Exercise Perform the same analysis with read counts of the Blischak data. Use the tung/reads.rds file to load the reads SCE object. Once you have finished please compare your results to ours (next chapter). 
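Returning to Exercise 2 above: only the perplexity argument needs to change. A minimal sketch, reusing the objects defined in this section (substitute 200 for the second plot):
set.seed(123456)
tmp &lt;- runTSNE( umi.qc[endog_genes, ], exprs_values = &quot;logcounts_raw&quot;, perplexity = 10 )
plotTSNE( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; )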
6.6.5 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scater_1.12.2 ggplot2_3.2.1 ## [3] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [5] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [7] matrixStats_0.55.0 Biobase_2.44.0 ## [9] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 ## [11] IRanges_2.18.3 S4Vectors_0.22.1 ## [13] BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] Rcpp_1.0.2 rsvd_1.0.2 ## [3] lattice_0.20-38 assertthat_0.2.1 ## [5] digest_0.6.21 R6_2.4.0 ## [7] evaluate_0.14 highr_0.8 ## [9] pillar_1.4.2 zlibbioc_1.30.0 ## [11] rlang_0.4.0 lazyeval_0.2.2 ## [13] irlba_2.3.3 Matrix_1.2-17 ## [15] rmarkdown_1.15 BiocNeighbors_1.2.0 ## [17] labeling_0.3 Rtsne_0.15 ## [19] stringr_1.4.0 RCurl_1.95-4.12 ## [21] munsell_0.5.0 compiler_3.6.0 ## [23] vipor_0.4.5 BiocSingular_1.0.0 ## [25] xfun_0.9 pkgconfig_2.0.3 ## [27] ggbeeswarm_0.6.0 htmltools_0.3.6 ## [29] tidyselect_0.2.5 tibble_2.1.3 ## [31] gridExtra_2.3 GenomeInfoDbData_1.2.1 ## [33] bookdown_0.13 viridisLite_0.3.0 ## [35] crayon_1.3.4 dplyr_0.8.3 ## [37] withr_2.1.2 bitops_1.0-6 ## [39] grid_3.6.0 gtable_0.3.0 ## [41] magrittr_1.5 scales_1.0.0 ## [43] stringi_1.4.3 XVector_0.24.0 ## [45] viridis_0.5.1 DelayedMatrixStats_1.6.1 ## [47] cowplot_1.0.0 tools_3.6.0 ## [49] glue_1.3.1 beeswarm_0.2.3 ## [51] purrr_0.3.2 yaml_2.2.0 ## [53] colorspace_1.4-1 knitr_1.25 6.7 Exercise: Data visualization (Reads) library(scater) options(stringsAsFactors = FALSE) reads &lt;- readRDS(&quot;data/tung/reads.rds&quot;) reads.qc &lt;- reads[rowData(reads)$use, colData(reads)$use] endog_genes &lt;- !rowData(reads.qc)$is_feature_control tmp &lt;- runPCA( reads[endog_genes, ], exprs_values = &quot;counts&quot; ) plotPCA( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.23: PCA plot of the tung data tmp &lt;- runPCA( reads[endog_genes, ], exprs_values = &quot;logcounts_raw&quot; ) plotPCA( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.24: PCA plot of the tung data tmp &lt;- runPCA( reads.qc[endog_genes, ], exprs_values = &quot;logcounts_raw&quot; ) plotPCA( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.25: PCA plot of the tung data set.seed(123456) tmp &lt;- runTSNE( reads[endog_genes, ], exprs_values = &quot;logcounts_raw&quot;, perplexity = 130 ) plotTSNE( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) Figure 6.26: tSNE map of the tung data set.seed(123456) tmp &lt;- runTSNE( reads.qc[endog_genes, ], exprs_values = &quot;logcounts_raw&quot;, perplexity = 130 ) plotTSNE( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, 
shape_by = &quot;individual&quot; ) Figure 6.27: tSNE map of the tung data Figure 6.21: tSNE map of the tung data (perplexity = 10) Figure 6.22: tSNE map of the tung data (perplexity = 200) sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scater_1.12.2 ggplot2_3.2.1 ## [3] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [5] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [7] matrixStats_0.55.0 Biobase_2.44.0 ## [9] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 ## [11] IRanges_2.18.3 S4Vectors_0.22.1 ## [13] BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] Rcpp_1.0.2 rsvd_1.0.2 ## [3] lattice_0.20-38 assertthat_0.2.1 ## [5] digest_0.6.21 R6_2.4.0 ## [7] evaluate_0.14 highr_0.8 ## [9] pillar_1.4.2 zlibbioc_1.30.0 ## [11] rlang_0.4.0 lazyeval_0.2.2 ## [13] irlba_2.3.3 Matrix_1.2-17 ## [15] rmarkdown_1.15 BiocNeighbors_1.2.0 ## [17] labeling_0.3 Rtsne_0.15 ## [19] stringr_1.4.0 RCurl_1.95-4.12 ## [21] munsell_0.5.0 compiler_3.6.0 ## [23] vipor_0.4.5 BiocSingular_1.0.0 ## [25] xfun_0.9 pkgconfig_2.0.3 ## [27] ggbeeswarm_0.6.0 htmltools_0.3.6 ## [29] tidyselect_0.2.5 tibble_2.1.3 ## [31] gridExtra_2.3 GenomeInfoDbData_1.2.1 ## [33] bookdown_0.13 viridisLite_0.3.0 ## [35] crayon_1.3.4 dplyr_0.8.3 ## [37] withr_2.1.2 bitops_1.0-6 ## [39] grid_3.6.0 gtable_0.3.0 ## [41] magrittr_1.5 scales_1.0.0 ## [43] stringi_1.4.3 XVector_0.24.0 ## [45] viridis_0.5.1 DelayedMatrixStats_1.6.1 ## [47] cowplot_1.0.0 tools_3.6.0 ## [49] glue_1.3.1 beeswarm_0.2.3 ## [51] purrr_0.3.2 yaml_2.2.0 ## [53] colorspace_1.4-1 knitr_1.25 References "],
 ["normalization-confounders-and-batch-correction.html", "7 Normalization, confounders and batch correction 7.1 Normalization theory 7.2 Normalization practice (UMI) 7.3 Normalization practice (Reads) 7.4 Identifying confounding factors 7.5 Identifying confounding factors (Reads) 7.6 Batch effects 7.7 Dealing with confounders (Reads) 7.8 Feature Selection", " 7 Normalization, confounders and batch correction 7.1 Normalization theory 7.1.1 Introduction In this chapter, we will explore approaches to normalization, confounder identification and batch correction for scRNA-seq data. Even in the absence of specific confounding factors, thoughtful normalization of scRNA-seq data is required. The raw count values are not directly comparable between cells, because in general the sequencing depth (number of reads obtained; often called library size) is very different across cells—orders-of-magnitude differences in sequencing depth are commonly observed between cells in an scRNA-seq dataset. If ignored, or not handled correctly, library size differences can be the dominant source of variation between single-cell gene expression profiles, obscuring the biological signal of interest. n Related to library size, differences in library composition can also cause problems when we are trying to compare expression profiles between cells. Normalization can and should also account for differences in library composition. In addition to normalization, it is also useful to identify confounding factors so that they can be accounted for in downstream analyses. In many cases, “accounting” for confounding variables may involve incorporating them as variables in a particular statistical model (e.g. in a differential expression model). In other cases, it may be desirable to “regress out” (either in a literal or figurative sense) confounding factors—the challenge for scRNA-seq data is finding the right model and/or data transformation such that regressing out confounding factors would work as desired. We discuss this further below. The issue of batch effects is just as important for scRNA-seq data as it is in other areas of genomics. Briefly, scRNA-seq and other ’omics assays are sensitive to minor differences in technical features of data generation. As such, even when assaying the same experimental or biological system, measurements taken at difference times and places or by different people will differ substantially. To make valid comparisons between cells, samples or groups, we first need to design our studies to be robust to batch effects and then we need to treat batch effects appropriately in our analyses. In the following sections, we will explore simple size-factor normalizations correcting for library size and composition and also discuss a more recent, conceptually quite different, approach to tackling the problem of library size differences between cells. 7.1.2 Library size Library sizes vary because scRNA-seq data is often sequenced on highly multiplexed platforms the total reads which are derived from each cell may differ substantially. Most scRNA-seq platforms and/or quantification methods currently available produce count values as the “raw”, “observed”, gene expression values. For such count data, the library size must be corrected for as part of data normalization. 
One popular strategy, borrowed and extended from the analysis of bulk RNA-seq data, is to multiply or divide each column of the expression matrix (in our setup columns correspond to cells) by a “normalization factor” which is an estimate of the library size relative to the other cells. Many methods to correct for library size have been developed for bulk RNA-seq and can be equally applied to scRNA-seq (e.g. UQ, SF, CPM, RPKM, FPKM, TPM). In addition, single-cell specific size-factor normalization methods have been proposed to better handle the characteristics of scRNA-seq data (namely greater sparsity/proportion of zero counts). We will demonstrate use of the size-factor normalization method from the scran package in this chapter. A conceptually different approach to normalization of scRNA-seq data was proposed earlier in 2019 by (Hafemeister and Satija 2019). The idea behind the sctransform approach is to fit a regularized negative binomial model to the raw count data, with library size as the only explanatory variable in the model. The residuals from this model can then be used as normalized and variance-stabilized expression values. We show the use of this method too in this chapter. Some quantification methods (particularly those that quantify transcript-level expression, e.g. Salmon, kallisto) return transcripts-per-million values, TPM (instead of or in addition to count values), which effectively incorporate library size when determining gene expression estimates and thus do not require subsequent normalization for library size. However, TPM values may still be susceptible to library composition biases and so normalization may still be required. 7.1.3 Scaling or size-factor normalization methods The normalization methods discussed in this section all involve dividing the counts for each cell by a constant value to account for library size and, in some cases, library composition. These methods will typically give (adjusted/normalized) counts-per-million (CPM) or transcripts-per-million (TPM) values. Ideally, after applying one of these scaling/size-factor normalization methods, the CPM/TPM values produced are comparable across cells, with the effects of sequencing depth removed. However, even if this is true (i.e. the normalization has worked well), the CPM/TPM values do not have stable variance. Specifically, as the size of the values increases, so does the variance. This feature of the data (heteroskedasticity, or asymmetric, heavy-tailed distributions) is problematic for statistical analysis methods that assume homoskedasticity, that is, that there is no relationship between the mean of expression values and their variance (i.e. just about anything that uses a Gaussian error model). As such, we should apply a variance stabilizing transformation to these data so that we can use standard statistical methods like linear regression and PCA with confidence. Developing a thoroughly effective variance stabilizing transformation is a challenge, so almost universally a log transformation (typically log2) is applied to the CPM/TPM values (the logcounts slot in a SingleCellExperiment object is expected to contain (normalized) log2-scale CPM/TPM values). For high-depth cells and highly-expressed genes this transformation generally works well (as for bulk RNA-seq data), but, as we will discuss below, it often performs sub-optimally for (sparse) scRNA-seq data. 
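The arithmetic shared by all of these scaling methods is simple: divide each cell's counts by its size factor, then log-transform. A minimal sketch on a toy matrix (the methods below differ only in how the size factor sf is estimated):
counts_mat &lt;- matrix(rpois(20, lambda = 10), nrow = 4)   # toy counts: 4 genes x 5 cells
sf &lt;- colSums(counts_mat) / mean(colSums(counts_mat))    # e.g. simple library-size factors
norm_mat &lt;- t(t(counts_mat) / sf)                        # divide each cell (column) by its factor
logcounts_mat &lt;- log2(norm_mat + 1)                      # the log2 transformation discussed above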
7.1.3.1 CPM The simplest way to normalize this data is to convert it to counts per million (CPM) by dividing each column by its total and then multiplying by 1,000,000. Note that spike-ins should be excluded from the calculation of total expression in order to correct for total cell RNA content, therefore we will only use endogenous genes. Example of a CPM function in R: calc_cpm &lt;- function (expr_mat, spikes = NULL) { norm_factor &lt;- colSums(expr_mat[-spikes, ]) return(t(t(expr_mat)/norm_factor) * 10^6) } One potential drawback of CPM is if your sample contains genes that are both very highly expressed and differentially expressed across the cells. In this case, the total molecules in the cell may depend on whether such genes are on/off in the cell, and normalizing by total molecules may hide the differential expression of those genes and/or falsely create differential expression for the remaining genes. To deal with this possibility several other measures were devised. Note RPKM, FPKM and TPM are variants on CPM which further adjust counts by the length of the respective gene/transcript. TPM is usually a direct output of a transcript expression quantification method (e.g. Salmon, kallisto, etc). 7.1.3.2 RLE (SF) The size factor (SF) was proposed and popularized by DESeq (Anders and Huber 2010). First, the geometric mean of each gene across all cells is calculated. The size factor for each cell is the median across genes of the ratio of the expression to the gene’s geometric mean. A drawback to this method is that, since it uses the geometric mean, only genes with non-zero expression values across all cells can be used in its calculation, making it inadvisable for large, low-depth scRNA-seq experiments. edgeR &amp; scater call this method RLE for “relative log expression” (to distinguish it from the many other size-factor normalization methods that now exist). Example of a SF function in R (following the approach used in the edgeR package): calc_sf &lt;- function (expr_mat, spikes = NULL) { geomeans &lt;- exp(rowMeans(log(expr_mat[-spikes, ]))) SF &lt;- function(cnts) { median((cnts/geomeans)[(is.finite(geomeans) &amp; geomeans &gt; 0)]) } norm_factor &lt;- apply(expr_mat[-spikes, ], 2, SF) return(t(t(expr_mat)/norm_factor)) } 7.1.3.3 UQ The upperquartile (UQ) was proposed by (Bullard et al. 2010). Here, each column is divided by the 75% quantile of the counts for each library. Often the calculated quantile is scaled by the median across cells to keep the absolute level of expression relatively consistent. A drawback to this method is that, for low-depth scRNA-seq experiments, the large number of undetected genes may result in the 75% quantile being zero (or close to it). This limitation can be overcome by generalizing the idea and using a higher quantile (e.g. the 99% quantile is the default in scater) or by excluding zeros prior to calculating the 75% quantile. Example of a UQ function in R (again following the edgeR package): calc_uq &lt;- function (expr_mat, spikes = NULL) { UQ &lt;- function(x) { quantile(x[x &gt; 0], 0.75) } uq &lt;- unlist(apply(expr_mat[-spikes, ], 2, UQ)) norm_factor &lt;- uq/median(uq) return(t(t(expr_mat)/norm_factor)) } 7.1.3.4 TMM Another method, TMM, is the weighted trimmed mean of M-values (to the reference) proposed by (Robinson and Oshlack 2010). The M-values in question are the gene-wise log2-fold changes between individual cells. One cell is used as the reference, then the M-values for each other cell are calculated relative to this reference. 
These values are then trimmed by removing the top and bottom ~30%, and the average of the remaining values is calculated by weighting them to account for the effect of the log scale on variance. Each non-reference cell is multiplied by the calculated factor. Two potential issues with this method are insufficient non-zero genes left after trimming, and the assumption that most genes are not differentially expressed. sizeFactors(umi.qc) &lt;- edgeR::calcNormFactors(counts(umi.qc), method = &quot;TMM&quot;) 7.1.3.5 scran The scran package implements a variant on CPM size-factor normalization specialized for single-cell data (L. Lun, Bach, and Marioni 2016). Briefly, this method deals with the problem of very large numbers of zero values per cell by pooling cells together and calculating a normalization factor (similar to TMM) for the sum of each pool. Since each cell is found in many different pools, cell-specific factors can be deconvoluted from the collection of pool-specific factors using linear algebra. The method first applies a “quick cluster” step to get rough clusters of cells to pool together, then applies the strategy outlined above within each cluster. qclust &lt;- quickCluster(umi.qc, min.size = 30) umi.qc &lt;- computeSumFactors(umi.qc, sizes = 15, clusters = qclust) 7.1.4 sctransform The sctransform method is very different from the scaling/size-factor methods discussed above. In their paper, (Hafemeister and Satija 2019) argue that the log-transformation of (normalized) CPM values does not stabilise the variance of expression values, particularly in the case of sparse(r) UMI-count data. Figure 1 of their paper (reproduced below) sets out the argument that strong relationships exist between gene expression and total cell UMI count, even after applying a scaled log-normalization method. Figure 2.4: Reproduction of Figure 1 from Hafemeister and Satija (2019). 33,148 PBMC dataset from 10x genomics. A) Distribution of total UMI counts / cell (’sequencing depth’). B) We placed genes into six groups, based on their average expression in the dataset. C) For each gene group, we examined the average relationship between observed counts and cell sequencing depth. We fit a smooth line for each gene individually and combined results based on the groupings in (B). Black line shows mean, colored region indicates interquartile range. D) Same as in (C), but showing scaled log-normalized values instead of UMI counts. Values were scaled (z-scored) so that a single y-axis range could be used. E) Relationship between gene variance and cell sequencing depth; Cells were placed into five equal-sized groups based on total UMI counts (group 1 has the greatest depth), and we calculated the total variance of each gene group within each bin. For effectively normalized data, each cell bin should contribute 20% to the variance of each gene group. One effect of the failure of the scaled log-normalization to remove the relationship between total cell UMI count and expression is that dimension-reduction methods (especially PCA) applied to the log-normalized data can return reduced dimension spaces where, very often, the first dimension is highly correlated with total cell UMI count or total genes expressed per cell. This effect is noted and discussed by (William Townes et al. 2019). The sctransform solution is to fit a negative binomial (NB) generalized linear model to the UMI counts for each gene, with an intercept term and a coefficient for library size (specifically using log10(total cell UMI count) as a covariate) as parameters in the model. 
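To make the model concrete, here is a minimal sketch of the (unregularized) per-gene NB regression at the core of sctransform, fitted to simulated data with MASS::glm.nb; the real method additionally regularizes the per-gene estimates by smoothing them across genes:
library(MASS)
set.seed(1)
log10_umi &lt;- log10(rpois(500, lambda = 20000))  # toy log10 total UMI count per cell
mu &lt;- 10^(log10_umi - 3)                        # gene mean increases with sequencing depth
y &lt;- rnbinom(500, mu = mu, size = 2)            # toy UMI counts for one gene
fit &lt;- glm.nb(y ~ log10_umi)                    # log(mu) = beta0 + beta1 * log10(total UMI)
coef(fit); fit$theta                            # intercept, depth coefficient and overdispersion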
The negative binomial model can account for much more variance in the observed count data than a simpler model like the Poisson can. To avoid overfitting the model to the data, the gene-wise intercept, library size and overdispersion parameters are regularized by fitting a loess (locally-linear smoothing method) to the per-gene estimates from the GLM. Figure 2.5: Reproduction of Figure 2A from Hafemeister and Satija (2019). They fit NB regression models for each gene individually, and bootstrapped the process to measure uncertainty in the resulting parameter estimates. A) Model parameters for 16,809 genes for the NB regression model, plotted as a function of average gene abundance. The color of each point indicates a parameter uncertainty score as determined by bootstrapping (Methods). Pink line shows the regularized parameters obtained via kernel regression. The regularized NB GLM is presented as an attractive middle ground between the (underfit) Poisson model and the (overfit) unregularized NB model. The Pearson residuals from the regularized NB GLM are used as “normalized” expression values for downstream analyses. Figure 2.6: Reproduction of Figure 4 from Hafemeister and Satija (2019). A) For four genes, we show the relationship between cell sequencing depth and molecular counts. White points show the observed data. Background color represents the Pearson residual magnitude under three error models. For MALAT1 (does not vary across cell types) the Poisson error model does not account for overdispersion, and incorrectly infers significant residual variation (biological heterogeneity). For S100A9 (a CD14+ Monocyte marker) and CD74 (expressed in antigen-presenting cells) the non-regularized NB model overfits the data, and collapses biological heterogeneity. For PPBP (a Megakaryocyte marker) both non-regularized models wrongly fit a negative slope. B) Boxplot of Pearson residuals for models shown in A. X-axis range shown is limited to [-8, 25] for visual clarity. The regularized NB GLM also provides a natural way to do feature selection (i.e. find informative genes) using the deviance of the fitted GLM for each gene. We discuss this further in the Feature Selection section. We find the Pearson residuals from sctransform to be highly suitable as input to visualisation (dimension reduction) and clustering methods. For several other analyses (e.g. differential expression analyses), where statistical models designed for sparse count data are available, we prefer to use approaches that work with the “raw” count data. We are not yet sure how well sctransform performs on full-length transcript (i.e. non-UMI) count data. 7.1.5 Downsampling A final way to correct for library size is to downsample the expression matrix so that each cell has approximately the same total number of molecules. The benefit of this method is that zero values will be introduced by the downsampling, thus eliminating any biases due to differing numbers of detected genes. However, the major drawback is that the process is not deterministic, so each time the downsampling is run the resulting expression matrix is slightly different. Thus, often analyses must be run on multiple downsamplings to ensure results are robust. Downsampling to the depth of the cell with the lowest sequencing depth (that still passes QC) will typically discard much (most) of the information gathered in a (typically expensive) scRNA-seq experiment. 
We view this as a heavy price to pay for a normalization method that generally does not seem to outperform alternatives. Thus, we would not recommend downsampling as a normalization strategy for scRNA-seq data unless all alternatives have failed. 7.1.6 Effectiveness To compare the effectiveness of different normalization methods we will use visual inspection of PCA plots and calculation of cell-wise relative log expression via scater’s plotRLE() function. Namely, cells with many (few) reads have higher (lower) than median expression for most genes, resulting in a positive (negative) RLE across the cell, whereas normalized cells have an RLE close to zero. Example of an RLE function in R: calc_cell_RLE &lt;- function (expr_mat, spikes = NULL) { RLE_gene &lt;- function(x) { if (median(unlist(x)) &gt; 0) { log((x + 1)/(median(unlist(x)) + 1))/log(2) } else { rep(NA, times = length(x)) } } if (!is.null(spikes)) { RLE_matrix &lt;- t(apply(expr_mat[-spikes, ], 1, RLE_gene)) } else { RLE_matrix &lt;- t(apply(expr_mat, 1, RLE_gene)) } cell_RLE &lt;- apply(RLE_matrix, 2, median, na.rm = T) return(cell_RLE) } Note The RLE, TMM, and UQ size-factor methods were developed for bulk RNA-seq data and, depending on the experimental context, may not be appropriate for single-cell RNA-seq data, as their underlying assumptions may be problematically violated. Note The calcNormFactors function from the edgeR package implements several library size normalization methods, making it easy to apply any of these methods to our data. Note edgeR makes extra adjustments to some of the normalization methods which may result in somewhat different results than if the original methods are followed exactly, e.g. edgeR’s and scater’s “RLE” method, which is based on the “size factor” used by DESeq, may give different results to the estimateSizeFactorsForMatrix method in the DESeq/DESeq2 packages. In addition, some (earlier) versions of edgeR will not calculate the normalization factors correctly unless lib.size is set at 1 for all cells. Note For CPM normalisation we use scater’s calculateCPM() function. For RLE, UQ and TMM we used to use scater’s normaliseExprs() function, but it was deprecated and has been removed from the package. For scran we use the scran package to calculate size factors (it also operates on the SingleCellExperiment class) and scater’s normalize() to normalise the data. All these normalization functions save the results to the logcounts slot of the SCE object. 7.2 Normalization practice (UMI) We will continue to work with the tung data that was used in the previous chapter. 
library(scRNA.seq.funcs) library(scater) library(scran) options(stringsAsFactors = FALSE) set.seed(1234567) umi &lt;- readRDS(&quot;data/tung/umi.rds&quot;) umi.qc &lt;- umi[rowData(umi)$use, colData(umi)$use] endog_genes &lt;- !rowData(umi.qc)$is_feature_control 7.2.1 Raw tmp &lt;- runPCA( umi.qc[endog_genes, ], exprs_values = &quot;logcounts_raw&quot; ) plotPCA( tmp, colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) + ggtitle(&quot;PCA plot: raw log-counts&quot;) Figure 7.1: PCA plot of the tung data 7.2.2 CPM logcounts(umi.qc) &lt;- log2(calculateCPM(umi.qc, use_size_factors = FALSE) + 1) plotPCA( umi.qc[endog_genes, ], colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) + ggtitle(&quot;PCA plot: log2(CPM) values&quot;) Figure 7.2: PCA plot of the tung data after CPM normalisation plotRLE( umi.qc[endog_genes, ], exprs_values = &quot;logcounts_raw&quot;, colour_by = &quot;batch&quot; ) + ggtitle(&quot;RLE plot: raw log-counts&quot;) Figure 7.3: Cell-wise RLE of the tung data. The relative log expression profile of each cell is represented by a boxplot, which appears as a line here. The grey bar in the middle for each cell represents the interquartile range of the RLE values; the coloured lines represent the whiskers of a boxplot and extend above and below the grey bar by 1.5 times the interquartile range. The median RLE value is shown with a circle. plotRLE( umi.qc[endog_genes, ], exprs_values = &quot;logcounts&quot;, colour_by = &quot;batch&quot; ) + ggtitle(&quot;RLE plot: log2(CPM)&quot;) Figure 7.4: Cell-wise RLE of the tung data. The relative log expression profile of each cell is represented by a boxplot, which appears as a line here. The grey bar in the middle for each cell represents the interquartile range of the RLE values; the coloured lines represent the whiskers of a boxplot and extend above and below the grey bar by 1.5 times the interquartile range. The median RLE value is shown with a circle. Q: How well would you say the two approaches above normalize the data? 7.2.3 scran scran’s method for size-factor estimation will almost always be preferable for scRNA-seq data to methods that were developed for bulk RNA-seq data (TMM, RLE, UQ). Thus, we will just demonstrate the use of scran size-factor normalization here as representative of size-factor normalization more generally. The code below computes the size factors and then the normalize() function in scater applies those size factors along with the library sizes to the count matrix to produce normalized log2-counts-per-million values that are then stored in the logcounts slot of the SingleCellExperiment object. qclust &lt;- quickCluster(umi.qc, min.size = 30, use.ranks = FALSE) umi.qc &lt;- computeSumFactors(umi.qc, sizes = 15, clusters = qclust) umi.qc &lt;- normalize(umi.qc) plotPCA( umi.qc[endog_genes, ], colour_by = &quot;batch&quot;, size_by = &quot;total_features_by_counts&quot;, shape_by = &quot;individual&quot; ) + ggtitle(&quot;PCA plot: scran size-factor normalization&quot;) Figure 7.5: PCA plot of the tung data after LSF normalisation plotRLE( umi.qc[endog_genes, ], exprs_values = &quot;logcounts&quot;, colour_by = &quot;batch&quot; ) + ggtitle(&quot;RLE plot: scran size-factor normalization&quot;) Figure 7.6: Cell-wise RLE of the tung data scran sometimes calculates negative or zero size factors. These will completely distort the normalized expression matrix. 
We can check the size factors scran has computed like so:

summary(sizeFactors(umi.qc))

##   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
## 0.4836  0.7747  0.9532  1.0000  1.1483  3.2873

For this dataset all the size factors are reasonable, so we are done. If you find scran has calculated negative size factors, try increasing the cluster and pool sizes until they are all positive. We sometimes filter out cells with very large size factors (you may like to think about why), but we will not demonstrate that here.

7.2.4 sctransform

The sctransform approach of using Pearson residuals from a regularized negative binomial generalized linear model was introduced above. Here we demonstrate how to apply this method. Note that (due to what looks like a bug in this version of sctransform) we need to convert the UMI count matrix to a sparse format to apply sctransform.

umi_sparse <- as(counts(umi.qc), "dgCMatrix")
### genes detected in at least min_cells cells will be kept (min_cells = 1 here)
sctnorm_data <- sctransform::vst(umi = umi_sparse, min_cells = 1,
    cell_attr = as.data.frame(colData(umi.qc)),
    latent_var = "log10_total_counts_endogenous")
## Pearson residuals, or deviance residuals
dim(sctnorm_data$y)

## [1] 14066   657

dim(umi.qc)

## [1] 14066   657

sctnorm_data$model_str

## [1] "y ~ log10_total_counts_endogenous"

assay(umi.qc, "sctrans_norm") <- sctnorm_data$y

Let us look at the NB GLM model parameters estimated by sctransform.

## Matrix of estimated model parameters per gene (theta and regression coefficients)
sctransform::plot_model_pars(sctnorm_data)

We can look at the effect of sctransform's normalization on three particular genes, ACTB, POU5F1 (aka OCT4) and CD74.

## ACTB, POU5F1 and CD74, respectively
genes_plot <- c("ENSG00000075624", "ENSG00000204531", "ENSG00000019582")
sctransform::plot_model(sctnorm_data, umi_sparse, genes_plot,
    plot_residual = TRUE, cell_attr = as.data.frame(colData(umi.qc)))

reducedDim(umi.qc, "PCA_sctrans_norm") <- reducedDim(
    runPCA(umi.qc[endog_genes, ], exprs_values = "sctrans_norm")
)
plotReducedDim(
    umi.qc,
    use_dimred = "PCA_sctrans_norm",
    colour_by = "batch",
    size_by = "total_features_by_counts",
    shape_by = "individual"
) + ggtitle("PCA plot: sctransform normalization")

Figure 7.7: PCA plot of the tung data after sctransform normalisation (Pearson residuals).

plotRLE(
    umi.qc[endog_genes, ],
    exprs_values = "sctrans_norm",
    colour_by = "batch"
) + ggtitle("RLE plot: sctransform normalization")

Figure 7.8: Cell-wise RLE of the tung data

7.2.5 Normalisation for gene/transcript length

Some methods combine library size and fragment/gene length normalization, such as:

- RPKM - Reads Per Kilobase Million (for single-end sequencing)
- FPKM - Fragments Per Kilobase Million (same as RPKM, but for paired-end sequencing; makes sure that paired ends mapped to the same fragment are not counted twice)
- TPM - Transcripts Per Kilobase Million (same as RPKM, but the order of normalizations is reversed: length first and sequencing depth second; see the toy example below)

These methods are not applicable to our dataset, since the end of the transcript which contains the UMI was preferentially sequenced. Furthermore, in general these should only be calculated using appropriate quantification software from aligned BAM files, not from read counts, since often only a portion of the entire gene/transcript is sequenced, not the entire length.
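Since the difference between FPKM and TPM is easy to gloss over, here is a small self-contained sketch with made-up numbers (toy counts and gene lengths, not from the tung data) showing that only the order of the two normalizations differs:

```r
# toy example: 3 genes x 2 cells (hypothetical values)
counts <- matrix(c(10, 20, 70, 5, 45, 50), nrow = 3,
                 dimnames = list(c("g1", "g2", "g3"), c("cellA", "cellB")))
len_kb <- c(2, 1, 0.5)  # gene lengths in kilobases

# FPKM/RPKM: normalize for sequencing depth first, then length
fpkm <- t(t(counts) / colSums(counts)) * 1e6 / len_kb

# TPM: normalize for length first, then sequencing depth
rpk <- counts / len_kb
tpm <- t(t(rpk) / colSums(rpk)) * 1e6

colSums(tpm)   # TPM columns always sum to 1e6
colSums(fpkm)  # FPKM columns generally do not
```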
If in doubt, check for a relationship between gene/transcript length and expression level. However, here we show how these normalisations can be calculated using scater. First, we need to find the effective transcript length in kilobases. However, our dataset contains only gene IDs, therefore we will be using the gene lengths instead of transcript lengths. scater uses the biomaRt package, which allows one to annotate genes by other attributes:

umi.qc <- getBMFeatureAnnos(
    umi.qc,
    filters = "ensembl_gene_id",
    attributes = c(
        "ensembl_gene_id",
        "hgnc_symbol",
        "chromosome_name",
        "start_position",
        "end_position"
    ),
    biomart = "ENSEMBL_MART_ENSEMBL",
    dataset = "hsapiens_gene_ensembl",
    host = "www.ensembl.org"
)

# If you have mouse data, change the arguments based on this example:
# getBMFeatureAnnos(
#     object,
#     filters = "ensembl_transcript_id",
#     attributes = c(
#         "ensembl_transcript_id",
#         "ensembl_gene_id",
#         "mgi_symbol",
#         "chromosome_name",
#         "transcript_biotype",
#         "transcript_start",
#         "transcript_end",
#         "transcript_count"
#     ),
#     biomart = "ENSEMBL_MART_ENSEMBL",
#     dataset = "mmusculus_gene_ensembl",
#     host = "www.ensembl.org"
# )

Some of the genes were not annotated, therefore we filter them out:

umi.qc.ann <- umi.qc[!is.na(rowData(umi.qc)$ensembl_gene_id), ]

Now we compute the total gene length in kilobases by using the end_position and start_position fields:

eff_length <- abs(rowData(umi.qc.ann)$end_position - rowData(umi.qc.ann)$start_position) / 1000
plot(eff_length, rowMeans(counts(umi.qc.ann)))

There is no relationship between gene length and mean expression, so FPKMs and TPMs are inappropriate for this dataset. This is what we would expect for UMI protocols that tag one end of the transcript. But we will demonstrate them anyway.

Note Here we calculate the total gene length instead of the total exon length. Many genes contain long introns, so their eff_length will be very different from what we have calculated. Please consider our calculation an approximation. If you want to use the total exon lengths, please refer to this page.

Now we are ready to perform the normalisations:

tpm(umi.qc.ann) <- log2(calculateTPM(umi.qc.ann, eff_length) + 1)

Plot the results as a PCA plot:

tmp <- runPCA(
    umi.qc.ann,
    exprs_values = "tpm"
)
plotPCA(
    tmp,
    colour_by = "batch",
    size_by = "total_features_by_counts",
    shape_by = "individual"
)

tpm(umi.qc.ann) <- log2(calculateFPKM(umi.qc.ann, eff_length) + 1)
tmp <- runPCA(
    umi.qc.ann,
    exprs_values = "tpm"
)
plotPCA(
    tmp,
    colour_by = "batch",
    size_by = "total_features_by_counts",
    shape_by = "individual"
)

Note The PCA looks for differences between cells. Gene length is the same across cells for each gene, thus FPKM is almost identical to the CPM plot (it is just rotated), since FPKM performs the CPM step first and then normalizes for gene length. TPM is different because it weights genes by their length before performing the CPM step.

7.2.6 Reflection

Q: What is your assessment of the performance of these different normalization methods on the data presented here?

Q: Which normalization method would you prefer for this dataset? Why?

7.2.7 Exercise

Perform the same analysis with read counts of the tung data.
Use tung/reads.rds file to load the reads SCE object. Once you have finished please compare your results to ours (next chapter). 7.2.8 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## [15] scRNA.seq.funcs_0.1.0 ## ## loaded via a namespace (and not attached): ## [1] viridis_0.5.1 dynamicTreeCut_1.63-1 ## [3] edgeR_3.26.8 BiocSingular_1.0.0 ## [5] viridisLite_0.3.0 DelayedMatrixStats_1.6.1 ## [7] elliptic_1.4-0 moments_0.14 ## [9] assertthat_0.2.1 statmod_1.4.32 ## [11] highr_0.8 dqrng_0.2.1 ## [13] GenomeInfoDbData_1.2.1 vipor_0.4.5 ## [15] yaml_2.2.0 globals_0.12.4 ## [17] pillar_1.4.2 lattice_0.20-38 ## [19] glue_1.3.1 limma_3.40.6 ## [21] digest_0.6.21 XVector_0.24.0 ## [23] colorspace_1.4-1 plyr_1.8.4 ## [25] cowplot_1.0.0 htmltools_0.3.6 ## [27] Matrix_1.2-17 pkgconfig_2.0.3 ## [29] listenv_0.7.0 bookdown_0.13 ## [31] zlibbioc_1.30.0 purrr_0.3.2 ## [33] scales_1.0.0 Rtsne_0.15 ## [35] tibble_2.1.3 withr_2.1.2 ## [37] lazyeval_0.2.2 magrittr_1.5 ## [39] crayon_1.3.4 evaluate_0.14 ## [41] future_1.14.0 MASS_7.3-51.1 ## [43] beeswarm_0.2.3 tools_3.6.0 ## [45] stringr_1.4.0 locfit_1.5-9.1 ## [47] munsell_0.5.0 irlba_2.3.3 ## [49] orthopolynom_1.0-5 compiler_3.6.0 ## [51] rsvd_1.0.2 contfrac_1.1-12 ## [53] rlang_0.4.0 grid_3.6.0 ## [55] RCurl_1.95-4.12 BiocNeighbors_1.2.0 ## [57] igraph_1.2.4.1 labeling_0.3 ## [59] bitops_1.0-6 rmarkdown_1.15 ## [61] codetools_0.2-16 hypergeo_1.2-13 ## [63] gtable_0.3.0 deSolve_1.24 ## [65] reshape2_1.4.3 R6_2.4.0 ## [67] gridExtra_2.3 knitr_1.25 ## [69] dplyr_0.8.3 future.apply_1.3.0 ## [71] stringi_1.4.3 ggbeeswarm_0.6.0 ## [73] Rcpp_1.0.2 sctransform_0.2.0 ## [75] tidyselect_0.2.5 xfun_0.9 7.3 Normalization practice (Reads) Figure 7.9: PCA plot of the tung data Figure 7.10: PCA plot of the tung data after CPM normalisation Figure 7.11: Cell-wise RLE of the tung data Figure 7.12: Cell-wise RLE of the tung data Figure 7.13: PCA plot of the tung data after LSF normalisation Figure 7.14: Cell-wise RLE of the tung data Figure 7.15: Cell-wise RLE of the tung data Let us look at the NB GLM model parameters estimated by sctransform. We can look at the effect of sctransform’s normalization on three particular genes, ACTB, POU5F1 (aka OCT4) and CD74. Figure 7.16: PCA plot of the tung reads data after sctransform normalisation (Pearson residuals). 
Figure 7.17: Cell-wise RLE of the tung reads data ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## [15] scRNA.seq.funcs_0.1.0 ## ## loaded via a namespace (and not attached): ## [1] viridis_0.5.1 dynamicTreeCut_1.63-1 ## [3] edgeR_3.26.8 BiocSingular_1.0.0 ## [5] viridisLite_0.3.0 DelayedMatrixStats_1.6.1 ## [7] elliptic_1.4-0 moments_0.14 ## [9] assertthat_0.2.1 statmod_1.4.32 ## [11] highr_0.8 dqrng_0.2.1 ## [13] GenomeInfoDbData_1.2.1 vipor_0.4.5 ## [15] yaml_2.2.0 globals_0.12.4 ## [17] pillar_1.4.2 lattice_0.20-38 ## [19] glue_1.3.1 limma_3.40.6 ## [21] digest_0.6.21 XVector_0.24.0 ## [23] colorspace_1.4-1 plyr_1.8.4 ## [25] cowplot_1.0.0 htmltools_0.3.6 ## [27] Matrix_1.2-17 pkgconfig_2.0.3 ## [29] listenv_0.7.0 bookdown_0.13 ## [31] zlibbioc_1.30.0 purrr_0.3.2 ## [33] scales_1.0.0 Rtsne_0.15 ## [35] tibble_2.1.3 withr_2.1.2 ## [37] lazyeval_0.2.2 magrittr_1.5 ## [39] crayon_1.3.4 evaluate_0.14 ## [41] future_1.14.0 MASS_7.3-51.1 ## [43] beeswarm_0.2.3 tools_3.6.0 ## [45] stringr_1.4.0 munsell_0.5.0 ## [47] locfit_1.5-9.1 irlba_2.3.3 ## [49] orthopolynom_1.0-5 compiler_3.6.0 ## [51] rsvd_1.0.2 contfrac_1.1-12 ## [53] rlang_0.4.0 grid_3.6.0 ## [55] RCurl_1.95-4.12 BiocNeighbors_1.2.0 ## [57] igraph_1.2.4.1 labeling_0.3 ## [59] bitops_1.0-6 rmarkdown_1.15 ## [61] codetools_0.2-16 hypergeo_1.2-13 ## [63] gtable_0.3.0 deSolve_1.24 ## [65] reshape2_1.4.3 R6_2.4.0 ## [67] gridExtra_2.3 knitr_1.25 ## [69] dplyr_0.8.3 future.apply_1.3.0 ## [71] stringi_1.4.3 ggbeeswarm_0.6.0 ## [73] Rcpp_1.0.2 sctransform_0.2.0 ## [75] tidyselect_0.2.5 xfun_0.9 7.4 Identifying confounding factors 7.4.1 Introduction There is a large number of potential confounders, artifacts and biases in scRNA-seq data. One of the main challenges in analysing scRNA-seq data stems from the fact that it is difficult to carry out a true technical replication (why?) to distinguish biological and technical variability. In the previous chapters we considered normalization and in this chapter we will continue to explore how experimental artifacts can be identified and removed. We will continue using the scater package since it provides a set of methods specifically for quality control of experimental and explanatory variables. Moreover, we will continue to work with the Blischak data that was used in the previous chapter. 
library(scater, quietly = TRUE)
library(scran)
options(stringsAsFactors = FALSE)
umi <- readRDS("data/tung/umi.rds")
umi.qc <- umi[rowData(umi)$use, colData(umi)$use]
endog_genes <- !rowData(umi.qc)$is_feature_control

The umi.qc dataset contains filtered cells and genes. Our next step is to explore technical drivers of variability in the data to inform data normalisation before downstream analysis.

7.4.2 Correlations with PCs

Let's first look again at the PCA plot of the QCed dataset, using the scran-normalized log2-CPM values:

qclust <- quickCluster(umi.qc, min.size = 30, use.ranks = FALSE)
umi.qc <- computeSumFactors(umi.qc, sizes = 15, clusters = qclust)
umi.qc <- normalize(umi.qc)
reducedDim(umi.qc, "PCA") <- reducedDim(
    runPCA(umi.qc[endog_genes, ], exprs_values = "logcounts", ncomponents = 10), "PCA")
plotPCA(
    umi.qc,
    colour_by = "batch",
    size_by = "total_features_by_counts"
)

Figure 7.18: PCA plot of the tung data

scater allows one to identify principal components that correlate with experimental and QC variables of interest (it ranks principal components by the \(R^2\) from a linear model regressing each PC's values against the variable of interest). Let's test whether some of the variables correlate with any of the PCs.

7.4.2.1 Top colData variables associated with PCs

The plot below shows, for each of the first 10 PCs, the variance explained by the ten variables in colData(umi.qc) that are most strongly associated with the PCs. [We will ignore the sample_id variable: it has a unique value for each cell, so it can explain all the variation for all PCs.]

plotExplanatoryPCs(umi.qc)

Figure 7.19: PC correlation with the number of detected genes

Indeed, we can see that PC1 can be almost completely explained by batch and individual (of course, batch is nested within individual). The total counts from ERCC spike-ins also explain a substantial proportion of the variability in PC1. Although the number of detected genes is not strongly correlated with the PCs here (after normalization), this is commonly the case and something to look out for. [You might like to replicate the plot above using raw logcounts values to see what happens without normalization.] This is a well-known issue in scRNA-seq and was described here.

7.4.3 Explanatory variables

scater can also compute the marginal \(R^2\) for each variable when fitting a linear model regressing expression values for each gene against just that variable, and display a density plot of the gene-wise marginal \(R^2\) values for the variables.

plotExplanatoryVariables(
    umi.qc,
    exprs_values = "logcounts_raw",
    variables = c(
        "total_features_by_counts",
        "total_counts",
        "batch",
        "individual",
        "pct_counts_ERCC",
        "pct_counts_MT"
    )
)

Figure 7.20: Explanatory variables

This analysis indicates that the number of detected genes (again) and also the sequencing depth (total number of UMI counts per cell) have substantial explanatory power for many genes, so these variables are good candidates for conditioning out in a normalization step or including in downstream statistical models [cf. sctransform's approach to normalization]. Expression of ERCCs also appears to be an important explanatory variable, and one notable feature of the above plot is that batch explains more than individual. What does that tell us about the technical and biological variability of the data?
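As a quick check of how much of this explanatory power the scran normalization has absorbed, one could rerun the same plot on the normalized logcounts computed above; a minimal sketch, using the same variables as the plot above:

```r
plotExplanatoryVariables(
    umi.qc,
    exprs_values = "logcounts",  # scran-normalized values, cf. "logcounts_raw" above
    variables = c(
        "total_features_by_counts",
        "total_counts",
        "batch",
        "individual",
        "pct_counts_ERCC",
        "pct_counts_MT"
    )
)
```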
7.4.4 Other confounders In addition to correcting for batch, there are other factors that one may want to compensate for. As with batch correction, these adjustments require extrinsic information. One popular method is scLVM which allows you to identify and subtract the effect from processes such as cell-cycle or apoptosis. In addition, protocols may differ in terms of their coverage of each transcript, their bias based on the average content of A/T nucleotides, or their ability to capture short transcripts. Ideally, we would like to compensate for all of these differences and biases. 7.4.5 Exercise Perform the same analysis with read counts of the Blischak data. Use tung/reads.rds file to load the reads SCESet object. Once you have finished please compare your results to ours (next chapter). 7.4.6 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## [15] knitr_1.25 ## ## loaded via a namespace (and not attached): ## [1] locfit_1.5-9.1 Rcpp_1.0.2 ## [3] rsvd_1.0.2 lattice_0.20-38 ## [5] assertthat_0.2.1 digest_0.6.21 ## [7] R6_2.4.0 dynamicTreeCut_1.63-1 ## [9] evaluate_0.14 highr_0.8 ## [11] pillar_1.4.2 zlibbioc_1.30.0 ## [13] rlang_0.4.0 lazyeval_0.2.2 ## [15] irlba_2.3.3 Matrix_1.2-17 ## [17] rmarkdown_1.15 labeling_0.3 ## [19] BiocNeighbors_1.2.0 statmod_1.4.32 ## [21] stringr_1.4.0 igraph_1.2.4.1 ## [23] RCurl_1.95-4.12 munsell_0.5.0 ## [25] compiler_3.6.0 vipor_0.4.5 ## [27] BiocSingular_1.0.0 xfun_0.9 ## [29] pkgconfig_2.0.3 ggbeeswarm_0.6.0 ## [31] htmltools_0.3.6 tidyselect_0.2.5 ## [33] tibble_2.1.3 gridExtra_2.3 ## [35] GenomeInfoDbData_1.2.1 bookdown_0.13 ## [37] edgeR_3.26.8 viridisLite_0.3.0 ## [39] crayon_1.3.4 dplyr_0.8.3 ## [41] withr_2.1.2 bitops_1.0-6 ## [43] grid_3.6.0 gtable_0.3.0 ## [45] magrittr_1.5 scales_1.0.0 ## [47] dqrng_0.2.1 stringi_1.4.3 ## [49] XVector_0.24.0 viridis_0.5.1 ## [51] limma_3.40.6 DelayedMatrixStats_1.6.1 ## [53] cowplot_1.0.0 tools_3.6.0 ## [55] glue_1.3.1 beeswarm_0.2.3 ## [57] purrr_0.3.2 yaml_2.2.0 ## [59] colorspace_1.4-1 7.5 Identifying confounding factors (Reads) Figure 7.21: PCA plot of the tung data Figure 7.22: PC correlation with the number of detected genes Figure 7.23: Explanatory variables ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] 
LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] locfit_1.5-9.1 Rcpp_1.0.2 ## [3] rsvd_1.0.2 lattice_0.20-38 ## [5] assertthat_0.2.1 digest_0.6.21 ## [7] R6_2.4.0 dynamicTreeCut_1.63-1 ## [9] evaluate_0.14 highr_0.8 ## [11] pillar_1.4.2 zlibbioc_1.30.0 ## [13] rlang_0.4.0 lazyeval_0.2.2 ## [15] irlba_2.3.3 Matrix_1.2-17 ## [17] rmarkdown_1.15 labeling_0.3 ## [19] BiocNeighbors_1.2.0 statmod_1.4.32 ## [21] stringr_1.4.0 igraph_1.2.4.1 ## [23] RCurl_1.95-4.12 munsell_0.5.0 ## [25] compiler_3.6.0 vipor_0.4.5 ## [27] BiocSingular_1.0.0 xfun_0.9 ## [29] pkgconfig_2.0.3 ggbeeswarm_0.6.0 ## [31] htmltools_0.3.6 tidyselect_0.2.5 ## [33] tibble_2.1.3 gridExtra_2.3 ## [35] GenomeInfoDbData_1.2.1 bookdown_0.13 ## [37] edgeR_3.26.8 viridisLite_0.3.0 ## [39] crayon_1.3.4 dplyr_0.8.3 ## [41] withr_2.1.2 bitops_1.0-6 ## [43] grid_3.6.0 gtable_0.3.0 ## [45] magrittr_1.5 scales_1.0.0 ## [47] dqrng_0.2.1 stringi_1.4.3 ## [49] XVector_0.24.0 viridis_0.5.1 ## [51] limma_3.40.6 DelayedMatrixStats_1.6.1 ## [53] cowplot_1.0.0 tools_3.6.0 ## [55] glue_1.3.1 beeswarm_0.2.3 ## [57] purrr_0.3.2 yaml_2.2.0 ## [59] colorspace_1.4-1 knitr_1.25

7.6 Batch effects

7.6.1 Introduction

In the previous chapter we normalized for library size, effectively removing it as a confounder. Now we will consider removing other, less well defined, confounders from our data. Technical confounders (aka batch effects) can arise from differences in reagents, isolation methods, the lab/experimenter who performed the experiment, and even the day/time at which the experiment was performed. Accounting for technical confounders, and batch effects particularly, is a large topic that also involves principles of experimental design. Here we address approaches that can be taken to account for confounders when the experimental design is appropriate.

Fundamentally, accounting for technical confounders involves identifying and, ideally, removing sources of variation in the expression data that are not related to (i.e. are confounding) the biological signal of interest. Various approaches exist, some of which use spike-in or housekeeping genes, and some of which use endogenous genes.

7.6.1.1 Advantages and disadvantages of using spike-ins to remove confounders

The use of spike-ins as control genes is conceptually appealing, since (ideally) the same amount of ERCC (or other) spike-in would be added to each cell in our experiment. In principle, all the variability we observe for these "genes" is due to technical noise, whereas endogenous genes are affected by both technical noise and biological variability. Technical noise can be removed by fitting a model to the spike-ins and "subtracting" this from the endogenous genes. There are several methods available based on this premise (e.g. BASiCS, scLVM, RUVg), each using different noise models and different fitting procedures.
Alternatively, one can identify genes which exhibit significant variation beyond technical noise (e.g. distance to median, highly variable genes). Unfortunately, there are major issues with the use of spike-ins for normalisation that limit their utility in practice. Perhaps surprisingly, their variability can, for various reasons, actually be higher than that of endogenous genes. One key reason for the difficulty of their use in practice is the need to pipette minuscule volumes of spike-in solution into each reaction, which is hard to do accurately and reproducibly. In addition, the most popular set of spike-ins, namely ERCCs, are derived from bacterial sequences, which raises concerns that their base content and structure diverge too far from gene structure in other biological systems of interest (e.g. mammalian genes) to be reliable for normalisation. Even in the best-case scenarios, spike-ins are limited to use on plate-based platforms; they are fundamentally incompatible with droplet-based platforms.

Given the issues with using spike-ins, and their limited availability on many platforms, normalisation methods based only on endogenous genes needed to be developed, and we consider them generally preferable, even for platforms where spike-ins may be used. Where we have a large number of endogenous genes that, on average, do not vary systematically between cells, and where we expect technical effects to affect a large number of genes (a very common and reasonable assumption), such methods (for example, the RUVs method) can perform well. We explore both general approaches below.

library(scRNA.seq.funcs)
library(RUVSeq)
library(scater)
library(SingleCellExperiment)
library(scran)
library(kBET)
library(sva) # ComBat
library(edgeR)
library(harmony)
set.seed(1234567)
options(stringsAsFactors = FALSE)
umi <- readRDS("data/tung/umi.rds")
umi.qc <- umi[rowData(umi)$use, colData(umi)$use]
endog_genes <- !rowData(umi.qc)$is_feature_control
erccs <- rowData(umi.qc)$is_feature_control
## Apply scran sum factor normalization
qclust <- quickCluster(umi.qc, min.size = 30, use.ranks = FALSE)
umi.qc <- computeSumFactors(umi.qc, sizes = 15, clusters = qclust)
umi.qc <- normalize(umi.qc)

7.6.2 Linear models

Linear models offer a relatively simple approach to accounting for batch effects and confounders. A linear model can correct for batches while preserving biological effects if you have a balanced design. In a confounded/replicated design, biological effects will not be fit/preserved. We could remove batch effects from each individual separately in order to preserve biological (and technical) variance between individuals (we will apply a similar approach with mnnCorrect, below).

Depending on how we have pre-processed our scRNA-seq data or what modelling assumptions we are willing to make, we may choose to use normal (Gaussian) linear models (i.e. assuming a normal distribution for noise) or generalized linear models (GLMs), where we can use any distribution from the exponential family. Given that we obtain highly variable count data from scRNA-seq assays, the obvious choice for a GLM is the negative binomial distribution, which has proven highly successful in the analysis of bulk RNA-seq data. For demonstration purposes here we will naively correct all confounded batch effects.

7.6.2.1 Gaussian (normal) linear models

The limma package in Bioconductor offers a convenient and efficient means to fit a linear model (with the same design matrix) to a dataset with a large number of features (i.e. genes) (Ritchie et al. 2015).
An added advantage of limma is its ability to apply empirical Bayes squeezing of variance estimates to improve inference. Provided we are satisfied making the assumption of a Gaussian distribution for residuals (this may be reasonable for normalized log-counts in many cases, though debate continues in the literature), we can apply limma to regress out (known) unwanted sources of variation as follows.

## fit a model just accounting for batch
lm_design_batch <- model.matrix(~0 + batch, data = colData(umi.qc))
fit_lm_batch <- lmFit(logcounts(umi.qc), lm_design_batch)
resids_lm_batch <- residuals(fit_lm_batch, logcounts(umi.qc))
assay(umi.qc, "lm_batch") <- resids_lm_batch

reducedDim(umi.qc, "PCA_lm_batch") <- reducedDim(
    runPCA(umi.qc[endog_genes, ], exprs_values = "lm_batch"), "PCA")
plotReducedDim(umi.qc, use_dimred = "PCA_lm_batch",
    colour_by = "batch",
    size_by = "total_features_by_counts",
    shape_by = "individual"
) + ggtitle("LM - regress out batch")

Two problems are immediately apparent with the approach above. First, batch is nested within individual, so simply regressing out batch as we have done above also regresses out differences between individuals that we would like to preserve. Second, we observe that the first principal component seems to separate cells by the number of genes (features) expressed, which is undesirable. We can address these concerns by correcting for batch within each individual separately, and also fitting the proportion of genes expressed per cell as a covariate. [NB: to preserve overall differences in expression levels between individuals we will need to apply a slight hack to the LM fit results (setting the intercept coefficient to zero).]

Exercise 2 Perform LM correction for each individual separately. Store the final corrected matrix in the lm_batch_indi slot.

## define cellular detection rate (cdr), i.e. proportion of genes expressed in each cell
umi.qc$cdr <- umi.qc$total_features_by_counts_endogenous / nrow(umi.qc)
## fit a model accounting for batch (and cdr) within each individual separately
lm_design_batch1 <- model.matrix(~batch + cdr,
    data = colData(umi.qc)[umi.qc$individual == "NA19098", ])
fit_indi1 <- lmFit(logcounts(umi.qc)[, umi.qc$individual == "NA19098"], lm_design_batch1)
fit_indi1$coefficients[, 1] <- 0 ## replace intercept with 0 to preserve reference batch
resids_lm_batch1 <- residuals(fit_indi1, logcounts(umi.qc)[, umi.qc$individual == "NA19098"])

lm_design_batch2 <- model.matrix(~batch + cdr,
    data = colData(umi.qc)[umi.qc$individual == "NA19101", ])
fit_indi2 <- lmFit(logcounts(umi.qc)[, umi.qc$individual == "NA19101"], lm_design_batch2)
fit_indi2$coefficients[, 1] <- 0 ## replace intercept with 0 to preserve reference batch
resids_lm_batch2 <- residuals(fit_indi2, logcounts(umi.qc)[, umi.qc$individual == "NA19101"])

lm_design_batch3 <- model.matrix(~batch + cdr,
    data = colData(umi.qc)[umi.qc$individual == "NA19239", ])
fit_indi3 <- lmFit(logcounts(umi.qc)[, umi.qc$individual == "NA19239"], lm_design_batch3)
fit_indi3$coefficients[, 1] <- 0 ## replace intercept with 0 to preserve reference batch
resids_lm_batch3 <- residuals(fit_indi3, logcounts(umi.qc)[, umi.qc$individual == "NA19239"])

identical(colnames(umi.qc), colnames(cbind(resids_lm_batch1, resids_lm_batch2, resids_lm_batch3)))
assay(umi.qc, "lm_batch_indi") <- cbind(resids_lm_batch1, resids_lm_batch2, resids_lm_batch3)

reducedDim(umi.qc, "PCA_lm_batch_indi") <- reducedDim(
    runPCA(umi.qc[endog_genes, ], exprs_values = "lm_batch_indi"), "PCA")
plotReducedDim(umi.qc, use_dimred = "PCA_lm_batch_indi",
    colour_by = "batch",
    size_by = "total_features_by_counts",
    shape_by = "individual"
) + ggtitle("LM - regress out batch within individuals separately")

What do you think of the results of this approach?

7.6.2.2 Negative binomial generalized linear models

Advanced exercise Can you use the edgeR package to fit a negative binomial generalized linear model and regress out batch effects? Hint: follow a similar approach to that taken in the limma example above. You will need to use the DGEList(), estimateDisp(), and glmQLFit() functions. (A possible sketch is given at the end of section 7.6.9 below.)

7.6.3 sctransform

The sctransform approach of using Pearson residuals from a regularized negative binomial generalized linear model was introduced above. Here we demonstrate how to apply this method. Note that (due to what looks like a bug in this version of sctransform) we need to convert the UMI count matrix to a sparse format to apply sctransform. These sctransform results will face the problem mentioned above of batch being nested within individual, which means that we cannot directly remove batch effects without removing differences between individuals. However, here we will demonstrate how you would try to remove batch effects with sctransform given a kinder experimental design.
umi_sparse <- as(counts(umi.qc), "dgCMatrix")
### genes detected in at least min_cells cells will be kept (min_cells = 1 here)
sctnorm_data <- sctransform::vst(umi = umi_sparse, min_cells = 1,
    cell_attr = as.data.frame(colData(umi.qc)),
    latent_var = c("log10_total_counts_endogenous", "batch"))
## Pearson residuals, or deviance residuals
sctnorm_data$model_str
assay(umi.qc, "sctrans_norm") <- sctnorm_data$y

Let us look at the NB GLM model parameters estimated by sctransform.

## Matrix of estimated model parameters per gene (theta and regression coefficients)
sctransform::plot_model_pars(sctnorm_data)

Do these parameters and the regularization look sensible to you? Any concerns?

reducedDim(umi.qc, "PCA_sctrans_norm") <- reducedDim(
    runPCA(umi.qc[endog_genes, ], exprs_values = "sctrans_norm")
)
plotReducedDim(
    umi.qc,
    use_dimred = "PCA_sctrans_norm",
    colour_by = "batch",
    size_by = "total_features_by_counts",
    shape_by = "individual"
) + ggtitle("PCA plot: sctransform normalization")

Figure 7.7: PCA plot of the tung data after sctransform normalisation (Pearson residuals).

Q: What's happened here? Was that expected? Any other comments?

7.6.4 Remove Unwanted Variation

Factors contributing to technical noise frequently appear as "batch effects", where cells processed on different days or by different technicians systematically vary from one another. Removing technical noise and correcting for batch effects can frequently be performed using the same tool or slight variants on it. We will be considering the Remove Unwanted Variation approach (implemented in the RUVSeq package). Briefly, RUVSeq works as follows. For \(n\) samples and \(J\) genes, consider the following generalized linear model (GLM), where the RNA-seq read counts are regressed on both the known covariates of interest and unknown factors of unwanted variation:

\[\log E[Y|W,X,O] = W\alpha + X\beta + O\]

Here, \(Y\) is the \(n \times J\) matrix of observed gene-level read counts, \(W\) is an \(n \times k\) matrix corresponding to the factors of "unwanted variation", \(X\) is an \(n \times p\) matrix corresponding to the \(p\) known covariates of interest, and \(O\) is an \(n \times J\) matrix of offsets that can either be set to zero or estimated with some other normalization procedure (such as upper-quartile normalization). The simultaneous estimation of \(W\), \(\alpha\), \(\beta\), and \(k\) is infeasible. Instead, for a given \(k\), one of the following three approaches is used to estimate the factors of unwanted variation \(W\):

- RUVg uses negative control genes (e.g. ERCCs), assumed to have constant expression across samples;
- RUVs uses centered (technical) replicate/negative control samples for which the covariates of interest are constant;
- RUVr uses residuals, e.g. from a first-pass GLM regression of the counts on the covariates of interest.

We will concentrate on the first two approaches.

7.6.4.1 RUVg

To use RUVg we will use ERCCs as negative control genes to anchor the estimation of factors representing unwanted variation. RUVg operates on the raw count data. We adjust the output normalized counts from RUVg so that they represent normalized counts-per-million, and then apply a log2 transformation. We run RUVg twice, with \(k=1\) and \(k=10\), so that we can compare the effect of estimating different numbers of hidden factors to capture unwanted variation in the data.
ruvg <- RUVg(counts(umi.qc), erccs, k = 1)
assay(umi.qc, "ruvg1") <- log2(
    t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1
)
ruvg <- RUVg(counts(umi.qc), erccs, k = 10)
assay(umi.qc, "ruvg10") <- log2(
    t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1
)

When we assess the effectiveness of various batch correction methods below, you can discuss whether or not you think using ERCCs as negative control genes for a method like RUVg is advisable (in this dataset and in general).

7.6.4.2 RUVs

In this application of RUVs we treat the individuals as replicates for which the covariates of interest are constant. As above, we adjust the output normalized counts from RUVs so that they represent normalized counts-per-million, and then apply a log2 transformation. Again, we run the method with \(k=1\) and \(k=10\) so that we can compare the effect of estimating different numbers of hidden factors.

scIdx <- matrix(-1, ncol = max(table(umi.qc$individual)), nrow = 3)
tmp <- which(umi.qc$individual == "NA19098")
scIdx[1, 1:length(tmp)] <- tmp
tmp <- which(umi.qc$individual == "NA19101")
scIdx[2, 1:length(tmp)] <- tmp
tmp <- which(umi.qc$individual == "NA19239")
scIdx[3, 1:length(tmp)] <- tmp
cIdx <- rownames(umi.qc)
ruvs <- RUVs(counts(umi.qc), cIdx, k = 1, scIdx = scIdx, isLog = FALSE)
assay(umi.qc, "ruvs1") <- log2(
    t(t(ruvs$normalizedCounts) / colSums(ruvs$normalizedCounts) * 1e6) + 1
)
ruvs <- RUVs(counts(umi.qc), cIdx, k = 10, scIdx = scIdx, isLog = FALSE)
assay(umi.qc, "ruvs10") <- log2(
    t(t(ruvs$normalizedCounts) / colSums(ruvs$normalizedCounts) * 1e6) + 1
)

7.6.5 Combat

If you have an experiment with a balanced design, ComBat can be used to eliminate batch effects while preserving biological effects, by specifying the biological effects using the mod parameter. However, the Tung data contains multiple experimental replicates rather than a balanced design, so using mod1 to preserve biological variability will result in an error.

combat_data <- logcounts(umi.qc)
mod_data <- as.data.frame(t(combat_data))
# Basic batch removal
mod0 <- model.matrix(~ 1, data = mod_data)
# Preserve biological variability
mod1 <- model.matrix(~ umi.qc$individual, data = mod_data)
# Adjust for total genes detected
mod2 <- model.matrix(~ umi.qc$total_features_by_counts, data = mod_data)
assay(umi.qc, "combat") <- ComBat(
    dat = t(mod_data),
    batch = factor(umi.qc$batch),
    mod = mod0,
    par.prior = TRUE,
    prior.plots = FALSE
)

## Standardizing Data across genes

Exercise 1 Perform ComBat correction accounting for total features as a covariate. Store the corrected matrix in the combat_tf slot. (A possible sketch is given at the end of section 7.6.9 below.)

7.6.6 mnnCorrect

mnnCorrect (Haghverdi et al. 2017) assumes that each batch shares at least one biological condition with each other batch. Thus it works well for a variety of balanced experimental designs. However, the Tung data contains multiple replicates for each individual rather than balanced batches, thus we will normalize each individual separately. Note that this will remove batch effects between batches within the same individual, but not the batch effects between batches in different individuals, due to the confounded experimental design. Thus we will merge a replicate from each individual to form three batches.
do_mnn <- function(data.qc) {
    batch1 <- logcounts(data.qc[, data.qc$replicate == "r1"])
    batch2 <- logcounts(data.qc[, data.qc$replicate == "r2"])
    batch3 <- logcounts(data.qc[, data.qc$replicate == "r3"])
    if (ncol(batch2) > 0) {
        x <- batchelor::mnnCorrect(
            batch1, batch2, batch3,
            k = 20,
            sigma = 0.1,
            cos.norm.in = TRUE,
            svd.dim = 2
        )
        return(x)
    } else {
        x <- batchelor::mnnCorrect(
            batch1, batch3,
            k = 20,
            sigma = 0.1,
            cos.norm.in = TRUE,
            svd.dim = 2
        )
        return(x)
    }
}

indi1 <- do_mnn(umi.qc[, umi.qc$individual == "NA19098"])
indi2 <- do_mnn(umi.qc[, umi.qc$individual == "NA19101"])
indi3 <- do_mnn(umi.qc[, umi.qc$individual == "NA19239"])
identical(colnames(umi.qc), colnames(cbind(indi1, indi2, indi3)))

## [1] TRUE

assay(umi.qc, "mnn") <- assay(cbind(indi1, indi2, indi3), "corrected")

# For a balanced design:
# assay(umi.qc, "mnn") <- mnnCorrect(
#     list(B1 = logcounts(batch1), B2 = logcounts(batch2), B3 = logcounts(batch3)),
#     k = 20,
#     sigma = 0.1,
#     cos.norm = TRUE,
#     svd.dim = 2
# )

The latest version of the batchelor package has a new fastMNN() method. The fastMNN() function first performs a principal components analysis (PCA); MNN identification and correction is then performed in this low-dimensional representation of the data, an approach that offers advantages in speed and denoising. The function returns a SingleCellExperiment object containing a matrix of corrected PC scores, which can be used directly for downstream analyses like clustering and visualization. [NB: fastMNN may actually be slower on small datasets like that considered here.]

indi1 <- batchelor::fastMNN(
    umi.qc[, umi.qc$individual == "NA19098"],
    batch = umi.qc[, umi.qc$individual == "NA19098"]$replicate)
indi2 <- batchelor::fastMNN(
    umi.qc[, umi.qc$individual == "NA19101"],
    batch = umi.qc[, umi.qc$individual == "NA19101"]$replicate)
indi3 <- batchelor::fastMNN(
    umi.qc[, umi.qc$individual == "NA19239"],
    batch = umi.qc[, umi.qc$individual == "NA19239"]$replicate)
identical(colnames(umi.qc), colnames(cbind(assay(indi1, "reconstructed"), assay(indi2, "reconstructed"), assay(indi3, "reconstructed"))))

## [1] TRUE

fastmnn <- cbind(assay(indi1, "reconstructed"), assay(indi2, "reconstructed"), assay(indi3, "reconstructed"))
identical(rownames(umi.qc), rownames(fastmnn))

## [1] FALSE

## fastMNN() drops 66 genes, so we cannot immediately add the reconstructed
## expression matrix to assays() in umi.qc. But we can run PCA on the
## reconstructed data from fastMNN() and add that to the reducedDim slot of our SCE object.
fastmnn_pca <- runPCA(fastmnn, rank = 2)
reducedDim(umi.qc, "fastmnn") <- fastmnn_pca$rotation

For further details, please consult the batchelor package documentation and vignette.

7.6.7 Harmony

Harmony (Korsunsky et al. 2018) is a newer batch correction method, which is designed to operate on PC space. The algorithm iteratively clusters the cells, with the objective function formulated to promote the mixing of cells from multiple datasets within each cluster. Once a clustering is obtained, the positions of the centroids of each dataset are obtained on a per-cluster basis and the coordinates are corrected. This procedure is iterated until convergence.
Harmony comes with a theta parameter that controls the degree of batch correction (higher values lead to stronger dataset integration), and it can account for multiple experimental and biological factors on input. Because the end result of Harmony is an altered dimension-reduction space built on top of PCA, we plot the resulting embedding here and exclude Harmony from the remaining comparisons in this section.

umi.qc.endog <- umi.qc[endog_genes, ]
umi.qc.endog <- runPCA(umi.qc.endog, exprs_values = 'logcounts', ncomponents = 20)
pca <- as.matrix(reducedDim(umi.qc.endog, "PCA"))
harmony_emb <- HarmonyMatrix(pca, umi.qc.endog$batch, theta = 2, do_pca = FALSE)
reducedDim(umi.qc.endog, "harmony") <- harmony_emb
plotReducedDim(
    umi.qc.endog,
    use_dimred = 'harmony',
    colour_by = "batch",
    size_by = "total_features_by_counts",
    shape_by = "individual"
)
reducedDim(umi.qc, "harmony") <- reducedDim(umi.qc.endog, "harmony")

7.6.8 How to evaluate and compare batch correction

A key question when considering the different methods for removing confounders is how to quantitatively determine which one is the most effective. The main reason why comparisons are challenging is that it is often difficult to know what corresponds to technical confounders and what is interesting biological variability. Here, we consider three different metrics which are all reasonable based on our knowledge of the experimental design. Depending on the biological question that you wish to address, it is important to choose a metric that allows you to evaluate the confounders that are likely to be the biggest concern for the given situation.

7.6.8.1 Effectiveness 1

We evaluate the effectiveness of the normalization by inspecting the PCA plot, where colour corresponds to the technical replicates and shape corresponds to different biological samples (individuals). Separation of biological samples and interspersed batches indicates that technical variation has been removed. We always use log2-CPM normalized data to match the assumptions of PCA.

for (nm in assayNames(umi.qc)) {
    cat(nm, " \n")
    tmp <- runPCA(
        umi.qc[endog_genes, ],
        exprs_values = nm
    )
    reducedDim(umi.qc, paste0("PCA_", nm)) <- reducedDim(tmp, "PCA")
}

## counts
## logcounts_raw
## logcounts
## sctrans_norm
## ruvg1
## ruvg10
## ruvs1
## ruvs10
## combat
## mnn

for (nm in reducedDimNames(umi.qc)) {
    print(
        plotReducedDim(
            umi.qc,
            use_dimred = nm,
            colour_by = "batch",
            size_by = "total_features_by_counts",
            shape_by = "individual"
        ) + ggtitle(nm)
    )
}

Exercise 3 Consider different k's for the RUV normalizations. Which gives the best results?

7.6.8.2 Effectiveness 2

We can also examine the effectiveness of correction using the relative log expression (RLE) across cells, to confirm technical noise has been removed from the dataset. Note that RLE only evaluates whether the numbers of genes higher and lower than average are equal for each cell, i.e. systematic technical effects. Random technical noise between batches may not be detected by RLE.

res <- list()
for (n in assayNames(umi.qc)) {
    res[[n]] <- suppressWarnings(calc_cell_RLE(assay(umi.qc, n), erccs))
}
par(mar = c(6, 4, 1, 1))
boxplot(res, las = 2)

7.6.8.3 Effectiveness 3

Another method to check the efficacy of batch-effect correction is to consider the intermingling of points from different batches in local subsamples of the data.
If there are no batch effects, then the proportion of cells from each batch in any local region should be equal to the global proportion of cells in each batch. kBET (Buttner et al. 2017) takes kNN networks around random cells and tests the number of cells from each batch against a binomial distribution. The rejection rate of these tests indicates the severity of the batch effects still present in the data (high rejection rate = strong batch effects). kBET assumes each batch contains the same complement of biological groups, thus it can only be applied to the entire dataset if a perfectly balanced design has been used. However, kBET can also be applied to replicated data if it is applied to each biological group separately. In the case of the Tung data, we will apply kBET to each individual independently to check for residual batch effects. However, this method will not identify residual batch effects which are confounded with biological conditions. In addition, kBET does not determine if biological signal has been preserved.

compare_kBET_results <- function(sce) {
    indiv <- unique(sce$individual)
    norms <- assayNames(sce)  # get all normalizations
    results <- list()
    for (i in indiv) {
        for (j in norms) {
            tmp <- kBET(
                df = t(assay(sce[, sce$individual == i], j)),
                batch = sce$batch[sce$individual == i],
                heuristic = TRUE,
                verbose = FALSE,
                addTest = FALSE,
                plot = FALSE
            )
            results[[i]][[j]] <- tmp$summary$kBET.observed[1]
        }
    }
    return(as.data.frame(results))
}

eff_debatching <- compare_kBET_results(umi.qc)

require("reshape2")
require("RColorBrewer")
# Plot results
dod <- melt(as.matrix(eff_debatching), value.name = "kBET")
colnames(dod)[1:2] <- c("Normalisation", "Individual")
colorset <- c('gray', brewer.pal(n = 9, "Oranges"))
ggplot(dod, aes(Normalisation, Individual, fill = kBET)) +
    geom_tile() +
    scale_fill_gradient2(
        na.value = "gray",
        low = colorset[2],
        mid = colorset[6],
        high = colorset[10],
        midpoint = 0.5,
        limit = c(0, 1)
    ) +
    scale_x_discrete(expand = c(0, 0)) +
    scale_y_discrete(expand = c(0, 0)) +
    theme(
        axis.text.x = element_text(
            angle = 45,
            vjust = 1,
            size = 12,
            hjust = 1
        )
    ) +
    ggtitle("Effect of batch regression methods per individual")

Exercise 4 Why do the raw counts appear to show so little batch effect?

7.6.9 Big Exercise

Perform the same analysis with read counts of the tung data. Use the tung/reads.rds file to load the reads SCE object. Once you have finished please compare your results to ours (next chapter). Additionally, experiment with other combinations of normalizations and compare the results.
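Before moving on, here are possible sketches for two of the exercises above; treat them as illustrative starting points rather than reference solutions. First, ComBat with the total number of detected features as a covariate (Exercise 1 in section 7.6.5), reusing the mod2 design defined there:

```r
## ComBat correction adjusting for total genes detected (sketch)
assay(umi.qc, "combat_tf") <- ComBat(
    dat = t(mod_data),
    batch = factor(umi.qc$batch),
    mod = mod2,  # design matrix with total_features_by_counts, defined in 7.6.5
    par.prior = TRUE,
    prior.plots = FALSE
)
```

Second, one way to approach the negative binomial GLM exercise (section 7.6.2.2) with edgeR. Since edgeR does not return batch-corrected expression values directly, taking Pearson residuals from the fitted model is our own choice of output here, and the assay name nbglm_batch is hypothetical:

```r
dge <- DGEList(counts = counts(umi.qc))
dge <- calcNormFactors(dge)
design <- model.matrix(~batch, data = as.data.frame(colData(umi.qc)))
dge <- estimateDisp(dge, design)
fit <- glmQLFit(dge, design)
## Pearson residuals from the NB fit as (approximately) batch-corrected values
mu <- fit$fitted.values            # fitted means, genes x cells
phi <- dge$tagwise.dispersion      # per-gene NB dispersion
assay(umi.qc, "nbglm_batch") <- (counts(umi.qc) - mu) / sqrt(mu + phi * mu^2)
```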
7.6.10 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] stats4 parallel stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] RColorBrewer_1.1-2 reshape2_1.4.3 ## [3] harmony_1.0 Rcpp_1.0.2 ## [5] sva_3.32.1 genefilter_1.66.0 ## [7] mgcv_1.8-28 nlme_3.1-139 ## [9] kBET_0.99.6 scran_1.12.1 ## [11] scater_1.12.2 ggplot2_3.2.1 ## [13] SingleCellExperiment_1.6.0 RUVSeq_1.18.0 ## [15] edgeR_3.26.8 limma_3.40.6 ## [17] EDASeq_2.18.0 ShortRead_1.42.0 ## [19] GenomicAlignments_1.20.1 SummarizedExperiment_1.14.1 ## [21] DelayedArray_0.10.0 matrixStats_0.55.0 ## [23] Rsamtools_2.0.1 GenomicRanges_1.36.1 ## [25] GenomeInfoDb_1.20.0 Biostrings_2.52.0 ## [27] XVector_0.24.0 IRanges_2.18.3 ## [29] S4Vectors_0.22.1 BiocParallel_1.18.1 ## [31] Biobase_2.44.0 BiocGenerics_0.30.0 ## [33] scRNA.seq.funcs_0.1.0 ## ## loaded via a namespace (and not attached): ## [1] backports_1.1.4 aroma.light_3.14.0 ## [3] plyr_1.8.4 igraph_1.2.4.1 ## [5] lazyeval_0.2.2 splines_3.6.0 ## [7] listenv_0.7.0 elliptic_1.4-0 ## [9] digest_0.6.21 htmltools_0.3.6 ## [11] viridis_0.5.1 magrittr_1.5 ## [13] memoise_1.1.0 contfrac_1.1-12 ## [15] cluster_2.1.0 globals_0.12.4 ## [17] annotate_1.62.0 R.utils_2.9.0 ## [19] prettyunits_1.0.2 colorspace_1.4-1 ## [21] blob_1.2.0 xfun_0.9 ## [23] dplyr_0.8.3 crayon_1.3.4 ## [25] RCurl_1.95-4.12 zeallot_0.1.0 ## [27] survival_2.43-3 glue_1.3.1 ## [29] gtable_0.3.0 zlibbioc_1.30.0 ## [31] BiocSingular_1.0.0 future.apply_1.3.0 ## [33] scales_1.0.0 DESeq_1.36.0 ## [35] DBI_1.0.0 viridisLite_0.3.0 ## [37] xtable_1.8-4 progress_1.2.2 ## [39] dqrng_0.2.1 bit_1.1-14 ## [41] rsvd_1.0.2 deSolve_1.24 ## [43] httr_1.4.1 FNN_1.1.3 ## [45] pkgconfig_2.0.3 XML_3.98-1.20 ## [47] R.methodsS3_1.7.1 locfit_1.5-9.1 ## [49] dynamicTreeCut_1.63-1 tidyselect_0.2.5 ## [51] labeling_0.3 rlang_0.4.0 ## [53] AnnotationDbi_1.46.1 munsell_0.5.0 ## [55] tools_3.6.0 moments_0.14 ## [57] RSQLite_2.1.2 batchelor_1.0.1 ## [59] evaluate_0.14 stringr_1.4.0 ## [61] yaml_2.2.0 knitr_1.25 ## [63] bit64_0.9-7 hypergeo_1.2-13 ## [65] purrr_0.3.2 future_1.14.0 ## [67] R.oo_1.22.0 biomaRt_2.40.4 ## [69] compiler_3.6.0 beeswarm_0.2.3 ## [71] tibble_2.1.3 statmod_1.4.32 ## [73] geneplotter_1.62.0 stringi_1.4.3 ## [75] highr_0.8 GenomicFeatures_1.36.4 ## [77] lattice_0.20-38 Matrix_1.2-17 ## [79] vctrs_0.2.0 lifecycle_0.1.0 ## [81] pillar_1.4.2 BiocNeighbors_1.2.0 ## [83] cowplot_1.0.0 bitops_1.0-6 ## [85] orthopolynom_1.0-5 irlba_2.3.3 ## [87] rtracklayer_1.44.4 R6_2.4.0 ## [89] latticeExtra_0.6-28 hwriter_1.3.2 ## [91] bookdown_0.13 gridExtra_2.3 ## [93] vipor_0.4.5 codetools_0.2-16 ## [95] MASS_7.3-51.1 assertthat_0.2.1 ## [97] withr_2.1.2 sctransform_0.2.0 ## [99] GenomeInfoDbData_1.2.1 hms_0.5.1 ## [101] grid_3.6.0 tidyr_1.0.0 ## [103] rmarkdown_1.15 DelayedMatrixStats_1.6.1 ## [105] Rtsne_0.15 ggbeeswarm_0.6.0 7.7 Dealing with confounders (Reads) library(scRNA.seq.funcs) library(RUVSeq) library(scater) library(SingleCellExperiment) library(scran) 
library(kBET) library(sva) # Combat library(harmony) library(edgeR) set.seed(1234567) options(stringsAsFactors = FALSE) reads &lt;- readRDS(&quot;data/tung/reads.rds&quot;) reads.qc &lt;- reads[rowData(reads)$use, colData(reads)$use] endog_genes &lt;- !rowData(reads.qc)$is_feature_control erccs &lt;- rowData(reads.qc)$is_feature_control qclust &lt;- quickCluster(reads.qc, min.size = 30) reads.qc &lt;- computeSumFactors(reads.qc, sizes = 15, clusters = qclust) reads.qc &lt;- normalize(reads.qc) ruvg &lt;- RUVg(counts(reads.qc), erccs, k = 1) assay(reads.qc, &quot;ruvg1&quot;) &lt;- log2( t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1 ) ruvg &lt;- RUVg(counts(reads.qc), erccs, k = 10) assay(reads.qc, &quot;ruvg10&quot;) &lt;- log2( t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1 ) scIdx &lt;- matrix(-1, ncol = max(table(reads.qc$individual)), nrow = 3) tmp &lt;- which(reads.qc$individual == &quot;NA19098&quot;) scIdx[1, 1:length(tmp)] &lt;- tmp tmp &lt;- which(reads.qc$individual == &quot;NA19101&quot;) scIdx[2, 1:length(tmp)] &lt;- tmp tmp &lt;- which(reads.qc$individual == &quot;NA19239&quot;) scIdx[3, 1:length(tmp)] &lt;- tmp cIdx &lt;- rownames(reads.qc) ruvs &lt;- RUVs(counts(reads.qc), cIdx, k = 1, scIdx = scIdx, isLog = FALSE) assay(reads.qc, &quot;ruvs1&quot;) &lt;- log2( t(t(ruvs$normalizedCounts) / colSums(ruvs$normalizedCounts) * 1e6) + 1 ) ruvs &lt;- RUVs(counts(reads.qc), cIdx, k = 10, scIdx = scIdx, isLog = FALSE) assay(reads.qc, &quot;ruvs10&quot;) &lt;- log2( t(t(ruvs$normalizedCounts) / colSums(ruvs$normalizedCounts) * 1e6) + 1 ) combat_data &lt;- logcounts(reads.qc) mod_data &lt;- as.data.frame(t(combat_data)) # Basic batch removal mod0 = model.matrix(~ 1, data = mod_data) # Preserve biological variability mod1 = model.matrix(~ reads.qc$individual, data = mod_data) # adjust for total genes detected mod2 = model.matrix(~ reads.qc$total_features_by_counts, data = mod_data) assay(reads.qc, &quot;combat&quot;) &lt;- ComBat( dat = t(mod_data), batch = factor(reads.qc$batch), mod = mod0, par.prior = TRUE, prior.plots = FALSE ) Exercise 1 do_mnn &lt;- function(data.qc) { batch1 &lt;- logcounts(data.qc[, data.qc$replicate == &quot;r1&quot;]) batch2 &lt;- logcounts(data.qc[, data.qc$replicate == &quot;r2&quot;]) batch3 &lt;- logcounts(data.qc[, data.qc$replicate == &quot;r3&quot;]) if (ncol(batch2) &gt; 0) { x &lt;- batchelor::mnnCorrect( batch1, batch2, batch3, k = 20, sigma = 0.1, cos.norm.in = TRUE, svd.dim = 2 ) return(x) } else { x &lt;- batchelor::mnnCorrect( batch1, batch3, k = 20, sigma = 0.1, cos.norm.in = TRUE, svd.dim = 2 ) return(x) } } indi1 &lt;- do_mnn(reads.qc[, reads.qc$individual == &quot;NA19098&quot;]) indi2 &lt;- do_mnn(reads.qc[, reads.qc$individual == &quot;NA19101&quot;]) indi3 &lt;- do_mnn(reads.qc[, reads.qc$individual == &quot;NA19239&quot;]) assay(reads.qc, &quot;mnn&quot;) &lt;- cbind(indi1, indi2, indi3) # For a balanced design: #assay(reads.qc, &quot;mnn&quot;) &lt;- mnnCorrect( # list(B1 = logcounts(batch1), B2 = logcounts(batch2), B3 = logcounts(batch3)), # k = 20, # sigma = 0.1, # cos.norm = TRUE, # svd.dim = 2 #) glm_fun &lt;- function(g, batch, indi) { model &lt;- glm(g ~ batch + indi) model$coef[1] &lt;- 0 # replace intercept with 0 to preserve reference batch. 
    return(model$coef)
}
effects <- apply(
    logcounts(reads.qc),
    1,
    glm_fun,
    batch = reads.qc$batch,
    indi = reads.qc$individual
)
corrected <- logcounts(reads.qc) - t(effects[as.numeric(factor(reads.qc$batch)), ])
assay(reads.qc, "glm") <- corrected

Exercise 2

reads.qc.endog <- reads.qc[endog_genes, ]
reads.qc.endog <- runPCA(reads.qc.endog, exprs_values = 'logcounts', ncomponents = 20)
pca <- as.matrix(reducedDim(reads.qc.endog, "PCA"))
harmony_emb <- HarmonyMatrix(pca, reads.qc.endog$batch, theta = 2, do_pca = FALSE)
reducedDim(reads.qc.endog, "harmony") <- harmony_emb
plotReducedDim(
    reads.qc.endog,
    use_dimred = 'harmony',
    colour_by = "batch",
    size_by = "total_features_by_counts",
    shape_by = "individual"
)

for (n in assayNames(reads.qc)) {
    tmp <- runPCA(
        reads.qc[endog_genes, ],
        exprs_values = n
    )
    print(
        plotPCA(
            tmp,
            colour_by = "batch",
            size_by = "total_features_by_counts",
            shape_by = "individual"
        ) + ggtitle(n)
    )
}

res <- list()
for (n in assayNames(reads.qc)) {
    res[[n]] <- suppressWarnings(calc_cell_RLE(assay(reads.qc, n), erccs))
}
par(mar = c(6, 4, 1, 1))
boxplot(res, las = 2)

compare_kBET_results <- function(sce) {
    indiv <- unique(sce$individual)
    norms <- assayNames(sce)  # get all normalizations
    results <- list()
    for (i in indiv) {
        for (j in norms) {
            tmp <- kBET(
                df = t(assay(sce[, sce$individual == i], j)),
                batch = sce$batch[sce$individual == i],
                heuristic = TRUE,
                verbose = FALSE,
                addTest = FALSE,
                plot = FALSE
            )
            results[[i]][[j]] <- tmp$summary$kBET.observed[1]
        }
    }
    return(as.data.frame(results))
}

eff_debatching <- compare_kBET_results(reads.qc)

require("reshape2")
require("RColorBrewer")
# Plot results
dod <- melt(as.matrix(eff_debatching), value.name = "kBET")
colnames(dod)[1:2] <- c("Normalisation", "Individual")
colorset <- c('gray', brewer.pal(n = 9, "RdYlBu"))
ggplot(dod, aes(Normalisation, Individual, fill = kBET)) +
    geom_tile() +
    scale_fill_gradient2(
        na.value = "gray",
        low = colorset[2],
        mid = colorset[6],
        high = colorset[10],
        midpoint = 0.5,
        limit = c(0, 1)
    ) +
    scale_x_discrete(expand = c(0, 0)) +
    scale_y_discrete(expand = c(0, 0)) +
    theme(
        axis.text.x = element_text(
            angle = 45,
            vjust = 1,
            size = 12,
            hjust = 1
        )
    ) +
    ggtitle("Effect of batch regression methods per individual")

7.8 Feature Selection

library(scRNA.seq.funcs)
library(matrixStats)
library(M3Drop)
library(RColorBrewer)
library(SingleCellExperiment)
library(Polychrome)
library(scater)
library(scran)
set.seed(1)

Single-cell RNASeq is capable of measuring the expression of many thousands of genes in every cell. However, in most situations only a portion of those will show a response to the biological condition of interest, e.g. differences in cell type, drivers of differentiation, or response to an environmental stimulus. For most genes detected in a scRNASeq experiment, differences in measured expression between cells will reflect only technical noise. One consequence of this is that technical noise and batch effects can obscure the biological signal of interest. Thus, it is often advantageous to perform feature selection to remove those genes which only exhibit technical noise from downstream analysis. Not only does this generally increase the signal-to-noise ratio in the data; it also reduces the computational complexity of analyses by reducing the total amount of data to be processed.
For scRNASeq data, we will be focusing on unsupervised methods of feature selection which don’t require any a priori information, such as cell-type labels or biological group, since they are not available, or may be unreliable, for many experiments. In contrast, differential expression (chapter 12) can be considered a form of supervised feature selection, since it uses the known biological label of each sample to identify features (i.e. genes) which are expressed at different levels across groups. For this section we will continue working with the Deng data. deng &lt;- readRDS(&quot;data/deng/deng-reads.rds&quot;) celltype_labs &lt;- colData(deng)$cell_type2 cell_colors &lt;- createPalette(10, c(&quot;#010101&quot;, &quot;#ff0000&quot;), M=1000) names(cell_colors) &lt;- unique(as.character(celltype_labs)) Feature selection is performed after QC; however, this data has already been QCed, so we can skip that step here. M3Drop contains two different feature selection methods: “M3DropFeatureSelection”, which is based on a Michaelis-Menten curve and is designed for full-transcript single-cell RNA-seq data (such as Smartseq2), and “NBumiFeatureSelectionCombinedDrop”, which is based on a negative binomial model and is designed for UMI count data. We will demonstrate both on the Deng Smartseq2 data. M3Drop feature selection runs directly on a normalized (but not log-transformed) expression matrix. This can be extracted from our SingleCellExperiment object using the command below. expr_matrix &lt;- M3Drop::M3DropConvertData(deng) ## [1] &quot;Removing 1134 undetected genes.&quot; This function is compatible with most single-cell RNA-seq analysis packages including: scater, SingleCellExperiment, monocle, and Seurat. It can also convert an existing expression matrix to the correct form (removing undetected genes &amp; normalizing/delogging) if you specify whether the matrix is raw counts or log transformed. Check the manual for details: ?M3Drop::M3DropConvertData Exercise 1: Confirm that the conversion function has removed undetected genes: 7.8.1 Identifying Genes vs a Null Model There are two main approaches to unsupervised feature selection. The first is to identify genes which behave differently from a null model describing just the technical noise expected in the dataset. If the dataset contains spike-in RNAs they can be used to directly model technical noise. However, measurements of spike-ins may not experience the same technical noise as endogenous transcripts (Svensson et al., 2017). In addition, scRNASeq experiments often contain only a small number of spike-ins, which reduces our confidence in fitted model parameters. 7.8.1.1 Highly Variable Genes - Brennecke method The first method proposed to identify features in scRNASeq datasets was to identify highly variable genes (HVG). The HVG approach assumes that if genes have large differences in expression across cells, some of those differences are due to biological differences between the cells rather than technical noise. However, because of the nature of count data, there is a positive relationship between the mean expression of a gene and the variance in the read counts across cells. This relationship must be corrected for to properly identify HVGs. Exercise 2 Use the functions rowMeans and rowVars to plot the relationship between mean expression and variance for all genes in this dataset. (Hint: use log=“xy” to plot on a log scale; one possible solution is sketched below.) An early method to correct for the relationship between variance and mean expression was proposed by Brennecke et al.
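A possible solution to Exercises 1 and 2 (a sketch, not the course’s official answer; it assumes the deng and expr_matrix objects created above):
# Exercise 1: confirm that undetected genes have been removed
nrow(deng) # genes before conversion
nrow(expr_matrix) # 1134 fewer genes after M3DropConvertData
# Exercise 2: mean-variance relationship on a log-log scale
gene_means &lt;- rowMeans(expr_matrix)
gene_vars &lt;- matrixStats::rowVars(as.matrix(expr_matrix))
plot(gene_means, gene_vars, log = &quot;xy&quot;, pch = 16, cex = 0.3, xlab = &quot;Mean expression&quot;, ylab = &quot;Variance&quot;)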
To use the Brennecke method, we first normalize for library size, then calculate the mean and the squared coefficient of variation (variance divided by the squared mean expression). A quadratic curve is fit to the relationship between these two variables for the ERCC spike-ins, and then a chi-square test is used to find genes significantly above the curve. This method is included in the M3Drop package as the Brennecke_getVariableGenes(counts, spikes) function. However, this dataset does not contain spike-ins, so we will use the entire dataset to estimate the technical noise. In the figure below the red curve is the fitted technical noise model and the dashed line is the 95% CI. Pink dots are the genes with significant biological variability after multiple-testing correction. Brennecke_HVG &lt;- BrenneckeGetVariableGenes( expr_matrix, fdr = 0.01, minBiolDisp = 0.5 ) This function returns a matrix of significant genes as well as their estimated effect size (difference between observed and expected coefficient of variation), and their significance as raw p.values and FDR corrected q.values. For now we will just keep the names of the significant HVG genes. HVG_genes &lt;- Brennecke_HVG$Gene Exercise 3 How many genes were significant using BrenneckeGetVariableGenes? ## [1] 1303 7.8.1.2 Highly Variable Genes - simpleSingleCell method The Bioconductor simpleSingleCell workflow has a great deal of excellent material to help your analyses. Here, we show how to identify highly variable genes using functionality from the scran package. This method assumes that technical variance is captured by a Poisson distribution, and that variance beyond that explained by a Poisson distribution represents biological variance of interest. This approach separates the biological component of the variance from the technical component and thus can rank genes based on their “biological” variance. This model also provides p-values (with FDR adjustment) that can be used to identify the set of “significant” highly variable genes at a given significance level.
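Before moving on to the scran-based approach shown next, one way to answer Exercise 3 (a sketch, assuming the Brennecke_HVG object returned above):
nrow(Brennecke_HVG) # number of significant genes, one row per gene
length(HVG_genes) # equivalently, count the stored gene names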
### make a technical trend of variance based on Poisson var.fit &lt;- trendVar(deng, parametric=TRUE, loess.args=list(span=0.4), use.spikes = FALSE) var.out &lt;- decomposeVar(deng, var.fit) plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab=&quot;Mean log-expression&quot;, ylab=&quot;Variance of log-expression&quot;) points(var.out$mean[isSpike(deng)], var.out$total[isSpike(deng)], col=&quot;red&quot;, pch=16) curve(var.fit$trend(x), col=&quot;dodgerblue&quot;, add=TRUE, lwd=2) chosen.genes &lt;- order(var.out$bio, decreasing=TRUE)[1:10] plotExpression(deng, rownames(var.out)[chosen.genes], point_alpha=0.5, jitter_type=&quot;jitter&quot;) top.dec &lt;- var.out[order(var.out$bio, decreasing=TRUE),] # the highly variable genes with largest biological components head(top.dec) ## DataFrame with 6 rows and 6 columns ## mean total bio ## &lt;numeric&gt; &lt;numeric&gt; &lt;numeric&gt; ## Obox6 7.0852220910669 39.7469062194493 27.7222625676479 ## BC053393 6.23846872763624 36.7868129334449 22.7409221497424 ## Krt18 8.06957111931139 30.7163256353151 21.3338604240051 ## Upp1 6.70443458808406 32.9196031154138 19.9537242012223 ## Akr1b8 9.31035205790714 25.9351262454146 19.563014227718 ## Spp1 5.52672835522051 34.8140952020968 19.5492807120572 ## tech p.value FDR ## &lt;numeric&gt; &lt;numeric&gt; &lt;numeric&gt; ## Obox6 12.0246436518013 6.67046481158613e-67 4.98750653962295e-64 ## BC053393 14.0458907837025 1.89687518927716e-40 5.90955657926056e-38 ## Krt18 9.38246521130992 1.28064383710762e-65 9.26649093876163e-63 ## Upp1 12.9658789141915 1.39045180596497e-37 3.89865305745004e-35 ## Akr1b8 6.37211201769662 2.70679041028919e-99 5.51963779029062e-96 ## Spp1 15.2648144900397 9.4641203490752e-29 1.76908069625088e-26 simplesinglecell_genes &lt;- rownames(top.dec)[top.dec$FDR &lt; 0.001] table(top.dec$FDR &lt; 0.001) ## ## FALSE TRUE ## 21124 1307 If we set an FDR threshold of 0.1%, this approach identifies around 1300 highly variable genes. The output of this variance modelling can be used as input to the denoisePCA() function to compute “denoised” principal components for clustering and other downstream analyses (details not shown here; please see the simpleSingleCell workflow). 7.8.1.3 High Dropout Genes An alternative to finding HVGs is to identify genes with unexpectedly high numbers of zeros. The frequency of zeros, known as the “dropout rate”, is very closely related to expression level in scRNASeq data. Zeros are the dominant feature of single-cell RNASeq data, typically accounting for over half of the entries in the final expression matrix. These zeros predominantly result from the failure of mRNAs to be reverse transcribed (Andrews and Hemberg, 2016). Reverse transcription is an enzyme reaction and can thus be modelled using the Michaelis-Menten equation: \\[P_{dropout} = 1 - S/(K + S)\\] where \\(S\\) is the mRNA concentration in the cell (we will estimate this as average expression) and \\(K\\) is the Michaelis-Menten constant. Because the Michaelis-Menten equation is a convex non-linear function, genes which are differentially expressed across two or more populations of cells in our dataset will be shifted up/right of the Michaelis-Menten model (see Figure below).
K &lt;- 49 S_sim &lt;- 10^seq(from = -3, to = 4, by = 0.05) # range of expression values MM &lt;- 1 - S_sim / (K + S_sim) plot( S_sim, MM, type = &quot;l&quot;, lwd = 3, xlab = &quot;Expression&quot;, ylab = &quot;Dropout Rate&quot;, xlim = c(1,1000) ) S1 &lt;- 10 # Mean expression in population 1 P1 &lt;- 1 - S1 / (K + S1) # Dropouts for cells in condition 1 S2 &lt;- 750 # Mean expression in population 2 P2 &lt;- 1 - S2 / (K + S2) # Dropouts for cells in condition 2 points( c(S1, S2), c(P1, P2), pch = 16, col = &quot;grey85&quot;, cex = 3 ) mix &lt;- 0.5 # proportion of cells in condition 1 points( S1 * mix + S2 * (1 - mix), P1 * mix + P2 * (1 - mix), pch = 16, col = &quot;grey35&quot;, cex = 3 ) Note: add log=&quot;x&quot; to the plot call above to see how this looks on the log scale, which is used in M3Drop figures. Exercise 4: Produce the same plot as above with different expression levels (S1 &amp; S2) and/or mixtures (mix). We use M3Drop to identify significant outliers to the right of the MM curve. We also apply 1% FDR multiple testing correction: M3Drop_genes &lt;- M3DropFeatureSelection( expr_matrix, mt_method = &quot;fdr&quot;, mt_threshold = 0.01 ) M3Drop_genes &lt;- M3Drop_genes$Gene The M3Drop package also contains an alternative method tailored specifically for UMI-tagged data, which generally contains many zeros resulting from low sequencing coverage in addition to those resulting from insufficient reverse-transcription. This model is the Depth-Adjusted Negative Binomial (DANB). This method describes each expression observation as a negative binomial model with a mean related to both the mean expression of the respective gene and the sequencing depth of the respective cell, and a variance related to the mean-expression of the gene. This method is designed to model the raw counts in a dataset directly, and we can extract the appropriate matrix using the “NBumiConvertData” function, similar to M3DropConvertData above. However, we have an extra step for fitting the model, since that is the slowest step of the method, and additional methods are being developed that can use this model information for other tasks (such as normalization, co-expression testing, and highly variable gene detection). This method includes a binomial test of the significance of each feature, but since the Deng data is not UMI counts the model does not fit the noise sufficiently and far too many genes will be called as significant. Thus we will take the top 1500 by effect size. deng_int &lt;- NBumiConvertData(deng) ## [1] &quot;Removing 1134 undetected genes.&quot; DANB_fit &lt;- NBumiFitModel(deng_int) # DANB is fit to the raw count matrix # Perform DANB feature selection DropFS &lt;- NBumiFeatureSelectionCombinedDrop(DANB_fit, method=&quot;fdr&quot;, qval.thresh=0.01, suppress.plot=FALSE) DANB_genes &lt;- DropFS[1:1500,]$Gene Exercise 5 How many genes were significant using NBumiFeatureSelectionCombinedDrop? ## [1] 10694 7.8.1.4 Residual variance from a (regularized) negative binomial model In the normalization chapter we introduced the sctransform approach to using Pearson residuals from a regularized negative binomial generalized linear model to normalize scRNA-seq data. The residual variance of genes (i.e. the variance of the Pearson residuals) provides a way to identify highly variable genes, where the “variance” is decoupled from the average level of expression of the gene. The residual variance is easily accessible from the sctransform output as we show below.
First, we run sctransform as we did previously. deng_sparse &lt;- as(counts(deng), &quot;dgCMatrix&quot;) ### Genes detected in at least one cell will be kept (min_cells = 1) sctnorm_data &lt;- sctransform::vst(umi = deng_sparse, min_cells = 1, cell_attr = as.data.frame(colData(deng)), latent_var = &quot;log10_total_counts_endogenous&quot;) sctnorm_data$model_str ## [1] &quot;y ~ log10_total_counts_endogenous&quot; library(ggplot2) ggplot(sctnorm_data$gene_attr, aes(residual_variance)) + geom_histogram(binwidth=0.1) + geom_vline(xintercept=1, color=&#39;red&#39;) + xlim(0, 10) + theme_bw() sctnorm_data$gene_attr$label &lt;- rownames(sctnorm_data$gene_attr) ggplot(sctnorm_data$gene_attr, aes(x = gmean, y=residual_variance)) + geom_point(alpha = 0.6) + geom_point(colour = &quot;firebrick2&quot;, data = sctnorm_data$gene_attr[sctnorm_data$gene_attr$residual_variance &gt; 3,]) + scale_x_log10() + geom_hline(yintercept = 1, size = 3, color = &quot;dodgerblue&quot;) + geom_label(aes(label = label), data = sctnorm_data$gene_attr[sctnorm_data$gene_attr$residual_variance &gt; 30,]) + theme_bw() sct_genes &lt;- rownames(sctnorm_data$gene_attr)[sctnorm_data$gene_attr$residual_variance &gt; 4] table(sctnorm_data$gene_attr$residual_variance &gt; 4) ## ## FALSE TRUE ## 20077 1220 If we set a (relatively arbitrary) threshold of a residual variance greater than four as marking a “highly variable gene”, then we identify around 1200 highly variable genes with this sctransform approach. [NB: the deng data is extremely high depth for scRNA-seq data, so not the most applicable dataset for sctransform, but we include this analysis here to demonstrate the method rather than make any evaluation of its performance in general.] Although not explored here, the deviance statistic from the regularized NB GLM fit provides a natural way to select informative features for downstream analyses. The deviance is a goodness-of-fit statistic for a statistical model. As Wikipedia notes, deviance is a generalization of the idea of using the sum of squares of residuals in ordinary least squares to cases where model-fitting is achieved by maximum likelihood. It plays an important role in exponential dispersion models and generalized linear models, such as the negative binomial model. However, sctransform does not seem set up to use the model deviance to select informative features, but we expect this could be a direction the field goes in the near future. Keep an eye out! 7.8.2 Correlated Expression A completely different approach to feature selection is to use gene-gene correlations. This method is based on the idea that multiple genes will be differentially expressed between different cell-types or cell-states. Genes which are expressed in the same cell-population will be positively correlated with each other, whereas genes expressed in different cell-populations will be negatively correlated with each other. Thus important genes can be identified by the magnitude of their correlation with other genes. The limitation of this method is that it assumes technical noise is random and independent for each cell, and thus shouldn’t produce gene-gene correlations; however, this assumption is violated by batch effects, which are generally systematic between different experimental batches and will produce gene-gene correlations. As a result it is more appropriate to take the top few thousand genes as ranked by gene-gene correlation than consider the significance of the correlations.
cor_feat &lt;- M3Drop::corFS(expr_matrix) Cor_genes &lt;- names(cor_feat)[1:1500] 7.8.3 Comparing Methods We can check whether the identified features really do represent genes differentially expressed between cell-types in this dataset. M3DropExpressionHeatmap( M3Drop_genes, expr_matrix, cell_labels = celltype_labs ) We can also consider how consistent each feature selection method is with the others using the Jaccard Index: J &lt;- sum(M3Drop_genes %in% HVG_genes)/length(unique(c(M3Drop_genes, HVG_genes))) Exercise 6 Plot the expression of the features for each of the other methods. Which appear to be differentially expressed? How consistent are the different methods for this dataset? Jaccard index comparison of sets of informative features: list_of_features &lt;- list( M3Drop_genes, DANB_genes, HVG_genes, simplesinglecell_genes, sct_genes ) Out &lt;- matrix( 0, ncol = length(list_of_features), nrow = length(list_of_features) ) for(i in 1:length(list_of_features) ) { for(j in 1:length(list_of_features) ) { Out[i,j] &lt;- sum(list_of_features[[i]] %in% list_of_features[[j]])/ length(unique(c(list_of_features[[i]], list_of_features[[j]]))) } } colnames(Out) &lt;- rownames(Out) &lt;- c(&quot;M3Drop&quot;, &quot;DANB&quot;, &quot;Brennecke&quot;, &quot;simpleSingleCell&quot;, &quot;sctransform&quot;) Out ## M3Drop DANB Brennecke simpleSingleCell ## M3Drop 1.0000000 0.38019061 0.4152905 0.14615908 ## DANB 0.3801906 1.00000000 0.2283346 0.09868187 ## Brennecke 0.4152905 0.22833459 1.0000000 0.15019157 ## simpleSingleCell 0.1461591 0.09868187 0.1501916 1.00000000 ## sctransform 0.2343257 0.21801471 0.2718985 0.26034913 ## sctransform ## M3Drop 0.2343257 ## DANB 0.2180147 ## Brennecke 0.2718985 ## simpleSingleCell 0.2603491 ## sctransform 1.0000000 7.8.4 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] scran_1.12.1 scater_1.12.2 ## [3] ggplot2_3.2.1 Polychrome_1.2.3 ## [5] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [7] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## [15] RColorBrewer_1.1-2 M3Drop_1.10.0 ## [17] numDeriv_2016.8-1.1 matrixStats_0.55.0 ## [19] scRNA.seq.funcs_0.1.0 ## ## loaded via a namespace (and not attached): ## [1] Rtsne_0.15 ggbeeswarm_0.6.0 ## [3] colorspace_1.4-1 dynamicTreeCut_1.63-1 ## [5] htmlTable_1.13.2 XVector_0.24.0 ## [7] base64enc_0.1-3 BiocNeighbors_1.2.0 ## [9] rstudioapi_0.10 listenv_0.7.0 ## [11] codetools_0.2-16 splines_3.6.0 ## [13] knitr_1.25 Formula_1.2-3 ## [15] cluster_2.1.0 sctransform_0.2.0 ## [17] compiler_3.6.0 dqrng_0.2.1 ## [19] backports_1.1.4 assertthat_0.2.1 ## [21] Matrix_1.2-17 lazyeval_0.2.2 ## [23] limma_3.40.6 BiocSingular_1.0.0 ## [25] acepack_1.4.1 htmltools_0.3.6 ## [27] tools_3.6.0 rsvd_1.0.2 ## [29] igraph_1.2.4.1 gtable_0.3.0 ## [31] glue_1.3.1 
GenomeInfoDbData_1.2.1 ## [33] reshape2_1.4.3 dplyr_0.8.3 ## [35] Rcpp_1.0.2 bbmle_1.0.20 ## [37] gdata_2.18.0 nlme_3.1-139 ## [39] DelayedMatrixStats_1.6.1 xfun_0.9 ## [41] stringr_1.4.0 globals_0.12.4 ## [43] irlba_2.3.3 gtools_3.8.1 ## [45] hypergeo_1.2-13 statmod_1.4.32 ## [47] future_1.14.0 edgeR_3.26.8 ## [49] zlibbioc_1.30.0 MASS_7.3-51.1 ## [51] scales_1.0.0 yaml_2.2.0 ## [53] gridExtra_2.3 rpart_4.1-15 ## [55] latticeExtra_0.6-28 stringi_1.4.3 ## [57] checkmate_1.9.4 orthopolynom_1.0-5 ## [59] contfrac_1.1-12 caTools_1.17.1.2 ## [61] rlang_0.4.0 pkgconfig_2.0.3 ## [63] moments_0.14 bitops_1.0-6 ## [65] evaluate_0.14 lattice_0.20-38 ## [67] purrr_0.3.2 htmlwidgets_1.3 ## [69] labeling_0.3 cowplot_1.0.0 ## [71] tidyselect_0.2.5 deSolve_1.24 ## [73] plyr_1.8.4 magrittr_1.5 ## [75] bookdown_0.13 R6_2.4.0 ## [77] gplots_3.0.1.1 Hmisc_4.2-0 ## [79] pillar_1.4.2 foreign_0.8-70 ## [81] withr_2.1.2 mgcv_1.8-28 ## [83] survival_2.43-3 scatterplot3d_0.3-41 ## [85] RCurl_1.95-4.12 nnet_7.3-12 ## [87] future.apply_1.3.0 tibble_2.1.3 ## [89] crayon_1.3.4 KernSmooth_2.23-15 ## [91] rmarkdown_1.15 viridis_0.5.1 ## [93] locfit_1.5-9.1 grid_3.6.0 ## [95] data.table_1.12.2 reldist_1.6-6 ## [97] digest_0.6.21 elliptic_1.4-0 ## [99] munsell_0.5.0 beeswarm_0.2.3 ## [101] viridisLite_0.3.0 vipor_0.4.5 References "],
 ["handling-sparsity.html", "8 Handling sparsity 8.1 Challenge: Handling sparsity in single-cell RNA sequencing 8.2 Status 8.3 Open problems", " 8 Handling sparsity The material below is reproduced from (Laehnemann et al. 2019): Laehnemann, D. et al. (2019) 12 Grand challenges in single-cell data science. PeerJ Preprints. link 8.1 Challenge: Handling sparsity in single-cell RNA sequencing A comprehensive characterization of the transcriptional status of individual cells enables us to gain full insight into the interplay of transcripts within single cells. However, scRNA-seq measurements typically suffer from large fractions of observed zeros, where a given gene in a given cell has no unique molecular identifiers or reads mapping to it. These observed zero values can represent either missing data (i.e. a gene is expressed but not detected by the sequencing technology) or true absence of expression. The proportion of zeros, or degree of sparsity, is thought to be due to imperfect reverse transcription and amplification, and other technical limitations, and depends on the scRNA-seq platform used, the sequencing depth and the underlying expression level of the gene. The term “dropout” is often used to denote observed zero values in scRNA-seq data, but this term conflates zero values attributable to methodological noise and biologically-true zero expression, so we recommend against its use as a catch-all term for observed zeros. Sparsity in scRNA-seq data can hinder downstream analyses, but it is challenging to model or handle it appropriately, and thus, there remains an ongoing need for improved methods. Sparsity pervades all aspects of scRNA-seq data analysis, but here we focus on the linked problems of learning latent spaces and “imputing” expression values from scRNA-seq data. Imputation, “data smoothing” and “data reconstruction” approaches are closely linked to the challenges of normalization. But whereas normalization generally aims to make expression values between cells more comparable to each other, imputation and data smoothing approaches aim to achieve adjusted data values that—it is hoped—better represent the true expression values. Imputation methods could therefore be used for normalization, but do not entail all possible or useful approaches to normalization. 8.2 Status The imputation of missing values has been very successful for genotype data. Crucially, when imputing genotypes we often know which data are missing (e.g. when no genotype call is possible due to no coverage of a locus; although see the relevant section for the challenges with such data) and rich sources of external information are available (e.g. haplotype reference panels). Thus, genotype imputation is now highly accurate and a commonly-used step in data processing for genetic association studies. The situation is somewhat different for scRNA-seq data, as we do not routinely have external reference information to apply. In addition, we can never be sure which observed zeros represent “missing data” and which accurately represent a true gene expression level in the cell. Observed zeros can either represent “biological” zeros, i.e. those present because the true expression level of a gene in a cell was zero, or they can be the result of methodological noise, which can arise when a gene has true non-zero expression in a cell, but no counts are observed due to failures at any point in the complicated process of processing mRNA transcripts in cells into mapped reads.
Such noise can lead to artefactual zeros that are either more systematic (e.g. sequence-specific mRNA degradation during cell lysis) or that occur by chance (e.g. barely expressed transcripts that at the same expression level will sometimes be detected and sometimes not, due to sampling variation, e.g. in the sequencing). The high degree of sparsity in scRNA-seq data therefore arises from technical zeros and true biological zeros, which are difficult to distinguish from one another. In general, two broad approaches can be applied to tackle this problem of sparsity: use statistical models that inherently model the sparsity, sampling variation and noise modes of scRNA-seq data with an appropriate data generative model; or attempt to “impute” values for observed zeros (ideally the technical zeros; sometimes also non-zero values) that better approximate the true gene expression levels. We prefer to use the first option where possible, and for many single-cell data analysis problems, statistical models appropriate for sparse count data exist and should be used (e.g. for differential expression analysis). However, there are many cases where the appropriate models are not available and accurate imputation of technical zeros would allow better results from downstream methods and algorithms that cannot handle sparse count data. For example, imputation could be particularly useful for many dimension reduction, visualization and clustering applications. It is therefore desirable to improve both statistical methods that work on sparse count data directly and approaches for data imputation for scRNA-seq data, whether by refining existing techniques or developing new ones. We define three broad (and sometimes overlapping) categories of methods that can be used to “impute” scRNA-seq data in the absence of an external reference: Model-based imputation methods of technical zeros use probabilistic models to identify which observed zeros represent technical rather than biological zeros and aim to impute expression levels just for these technical zeros, leaving other observed expression levels untouched; or Data-smoothing methods define sets of “similar” cells (e.g. cells that are neighbors in a graph or occupy a small region in a latent space) and adjust expression values for each cell based on expression values in similar cells. These methods adjust all expression values, including technical zeros, biological zeros and observed non-zero values. Data-reconstruction methods typically aim to define a latent space representation of the cells. This is often done through matrix factorization (e.g. principal component analysis) or, increasingly, through machine learning approaches (e.g. variational autoencoders that exploit deep neural networks to capture non-linear relationships). Although a broad class of methods, both matrix factorization methods and autoencoders (among others) are able to “reconstruct” the observed data matrix from low-rank or simplified representations. The reconstructed data matrix will typically no longer be sparse (with many zeros) and the implicitly “imputed” data can be used for downstream applications that cannot handle sparse count data. The first category of methods generally seeks to infer a probabilistic model that captures the data generation mechanism. Such generative models can be used to identify, probabilistically, which observed zeros correspond to technical zeros (to be imputed) and which correspond to biological zeros (to be left alone).
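To make the distinction between technical and biological zeros concrete, a toy simulation of our own (assuming, purely for illustration, that sequencing acts as Poisson sampling; this is not any published method) shows how sampling variation alone produces observed zeros for a gene whose true expression is non-zero:
set.seed(1)
true_mean &lt;- 0.5 # true mean count per cell for one gene
obs &lt;- rpois(1000, lambda = true_mean) # sampling noise only
mean(obs == 0) # about exp(-0.5) = 0.61: many zeros despite true non-zero expression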
There are many model-based imputation methods already available that use ideas from clustering (e.g. k-means), dimension reduction, regression and other techniques to impute technical zeros, oftentimes combining ideas from several of these approaches. These include SAVER, scImpute, bayNorm, scRecover, and VIPER. Clustering methods that implicitly impute values, such as CIDR and BISCUIT, are closely related to this class of imputation methods. Data-smoothing methods, which adjust all gene expression levels based on expression levels in “similar” cells, have also been proposed to handle imputation problems. We might regard these approaches as “denoising” methods. To take a simplified example, we might imagine that single cells originally refer to points in two-dimensional space, but are likely to describe a one-dimensional curve; projecting data points onto that curve eventually allows imputation of the “missing” values (but all points are adjusted, or smoothed, not just true technical zeros). Prominent data-smoothing approaches to handling sparse counts include: diffusion-based MAGIC; k-nearest neighbor-based knn-smooth; network diffusion-based netSmooth; clustering-based DrImpute; and locality sensitive imputation in LSImpute. A major task in the analysis of high-dimensional single-cell data is to find low-dimensional representations of the data that capture the salient biological signals and render the data more interpretable and amenable to further analyses. As it happens, the matrix factorization and latent-space learning methods used for that task also provide another route for imputation through their ability to reconstruct the observed data matrix from simplified representations of it. PCA is one such standard matrix factorization method that can be applied to scRNA-seq data (preferably after suitable data normalization), as are other widely-used general statistical methods like ICA and NMF. As (linear) matrix factorization methods, PCA, ICA and NMF decompose the observed data matrix into a “small” number of factors in two low-rank matrices, one representing cell-by-factor weights and one representing gene-by-factor loadings. Many matrix factorization methods with tweaks for single-cell data have been proposed in recent years, including: ZIFA, a zero-inflated factor analysis; f-scLVM, a sparse Bayesian latent variable model; GPLVM, a Gaussian process latent variable model; ZINB-WaVE, a zero-inflated negative binomial factor model; scCoGAPS, an extension of CoGAPS; consensus NMF, a meta-analysis approach to NMF; pCMF, probabilistic count matrix factorization with a Poisson model; and SDA, sparse decomposition of arrays, another sparse Bayesian method. Some data reconstruction approaches have been specifically proposed for imputation, including: ENHANCE, denoising with an aggregation step; ALRA, SVD with adaptive thresholding; and scRMD, robust matrix decomposition. Recently, machine learning methods have emerged that apply autoencoders and deep neural networks, or ensemble learning, to impute expression values. Additionally, many deep learning methods have been proposed for single-cell data analysis that can, but need not, use probabilistic data generative processes to capture low-dimensional or latent space representations of a dataset. Even if imputation is not a main focus, such methods can generate “imputed” expression values as an upshot of a model primarily focused on other tasks like learning latent spaces, clustering, batch correction, or visualization (and often several of these tasks simultaneously).
The latter set includes tools such as: DCA, an autoencoder with a zero-inflated negative binomial distribution; scVI, a variational autoencoder with a zero-inflated negative binomial model; LATE; VASC; compscVAE; scScope; Tybalt; SAUCIE; scvis; net-SNE; BERMUDA, focused on batch correction; DUSC; Expression Saliency; and others. Besides the three categories described above, a small number of scRNA-seq imputation methods have been developed to incorporate information external to the current dataset for imputation. These include: ADImpute, which uses gene regulatory network information from external sources; SAVER-X, a transfer learning method for denoising and imputation that can use information from atlas-type resources; and methods that borrow information from matched bulk RNA-seq data like URSM and SCRABBLE. 8.3 Open problems A major challenge in this context is the circularity that arises when imputation solely relies on information that is internal to the imputed dataset. This circularity can artificially amplify the signal contained in the data, leading to inflated correlations between genes and/or cells. In turn, this can introduce false positives in downstream analyses such as differential expression testing and gene network inference. Handling batch effects and potential confounders requires further work to ensure that imputation methods do not mistake unwanted variation from technical sources for biological signal. In a similar vein, single-cell experiments are affected by various uncertainties. Approaches that allow quantification and propagation of the uncertainties associated with expression measurements may help to avoid problems associated with ‘overimputation’ and the introduction of spurious signals. To avoid this circularity, it is important to identify reliable external sources of information that can inform the imputation process. One possibility is to exploit external reference panels (like in the context of genetic association studies). Such panels are not generally available for scRNA-seq data, but ongoing efforts to develop large scale cell atlases could provide a valuable resource for this purpose. Systematic integration of known biological network structures is desirable and may also help to avoid circularity. A possible approach is to encode network structure knowledge as prior information, as attempted in netSmooth and ADImpute. Another alternative solution is to explore complementary types of data that can inform scRNA-seq imputation. This idea was adopted in SCRABBLE and URSM, where an external reference is defined by bulk expression measurements from the same population of cells for which imputation is performed. Yet another possibility could be to incorporate orthogonal information provided by different types of molecular measurements. Methods designed to integrate multi-omics data could then be extended to enable scRNA-seq imputation, e.g. through generative models that explicitly link scRNA-seq with other data types or by inferring a shared low-dimensional latent structure that could be used within a data-reconstruction framework. With the proliferation of alternative methods, comprehensive benchmarking is urgently required, as for all areas of single-cell data analysis. Early attempts provide valuable insights into the performance of methods available at the time, but many more methods have since been proposed and even more comprehensive benchmarking platforms are needed.
Many methods, especially those using deep learning, depend strongly on the choice of hyperparameters. Here, more detailed comparisons that explore parameter spaces would be helpful, extending work like that comparing dimensionality reduction methods. Learning from exemplary benchmarking studies, it would be immensely beneficial to develop a community-supported benchmarking platform with a wide range of synthetic and experimental ground-truth datasets (or as close to ground truth as possible, in the case of experimental data) and a variety of thoughtful metrics for evaluating performance. Ideally, such a benchmarking platform would remain dynamic beyond an initial publication to allow ongoing comparison of methods as new approaches are proposed. Detailed benchmarking would also help to establish when normalization methods derived from explicit count models may be preferable to imputation. Finally, scalability for large numbers of cells remains an ongoing concern for imputation, data smoothing and data reconstruction methods, as for all high-throughput single-cell methods and software. References "],
-["latent-spaces.html", "9 Latent spaces 9.1 Dimensionality reduction 9.2 Matrix factorization and factor analysis 9.3 Autoencoders", " 9 Latent spaces In many cases we may like to think of cells sitting in a low-dimensional, “latent” space that captures relationships between cells more intuitively than the very high-dimensional gene expression space. library(scater) library(SingleCellExperiment) library(glmpca) library(ggplot2) library(Polychrome) library(slalom) 9.1 Dimensionality reduction Why? - Reduce Curse of Dimensionality problems - Increase storage and computational efficiency - Visualize Data in 2D or 3D Difficulty: Need to decide how many dimensions to keep. 9.1.1 PCA: Principal component analysis 9.1.1.1 (traditional) PCA PCA is a linear feature extraction technique. It performs a linear mapping of the data to a lower-dimensional space in such a way that the variance of the data in the low-dimensional representation is maximized. It does so by calculating the eigenvectors from the covariance matrix. The eigenvectors that correspond to the largest eigenvalues (the principal components) are used to reconstruct a significant fraction of the variance of the original data. In simpler terms, PCA combines your input features in a specific way such that you can drop the least important features while still retaining the most valuable parts of all of the features. As an added benefit, each of the new features or components created by PCA is independent of the others. 9.1.1.1.1 Basic ideas of PCA Idea1: Dropping dimensions = Projection onto a lower dimensional space Which dimensions should we keep? Idea2: more variation = more information But what if the data are not readily projected onto either the X- or Y-axis? 9.1.1.1.2 Steps of PCA Step1: Rotation We want a set of axes (called principal components) that satisfies: -The 1st axis points in the direction where variation is maximized, and so on -They are orthogonal to each other It can be shown that the eigenvectors of the covariance matrix satisfy these conditions, and the eigenvector corresponding to the largest eigenvalue accounts for the most variation. Step2: Projection (3 dimensions \\(\\rightarrow\\) 2 dimensions) 9.1.1.1.3 An example of PCA deng &lt;- readRDS(&quot;data/deng/deng-reads.rds&quot;) my_color1 &lt;- createPalette(6, c(&quot;#010101&quot;, &quot;#ff0000&quot;), M=1000) names(my_color1) &lt;- unique(as.character(deng$cell_type1)) my_color2 &lt;- createPalette(10, c(&quot;#010101&quot;, &quot;#ff0000&quot;), M=1000) names(my_color2) &lt;- unique(as.character(deng$cell_type2)) deng &lt;- runPCA(deng, ncomponents = 2) plotPCA(deng, colour_by = &quot;cell_type1&quot;) + scale_fill_manual(values = my_color1) plotPCA(deng, colour_by = &quot;cell_type2&quot;) + scale_fill_manual(values = my_color2) 9.1.1.1.4 Advantages and limits of PCA: Advantages: fast, easy to use and intuitive. Limits: Can lead to local inconsistency, i.e. far away points can become nearest neighbours. It is a linear projection, like casting a shadow, meaning it can’t capture non-linear dependencies. For instance, PCA would not be able to “unroll” the following structure. 9.1.1.2 GLM-PCA (Collins, Dasgupta, and Schapire 2002) (Townes et al. 2019) GLM-PCA is a generalized version of the traditional PCA. The traditional PCA implicitly imposes an assumption of Gaussian distribution. The purpose of GLM-PCA is to loosen this condition to accommodate other distributions of the exponential family. Why does PCA assume a Gaussian distribution?
Let \\(x_1, \\dots, x_n \\in \\mathcal{R}^d\\) be the \\(d\\)-dimensional data observed. PCA is looking for their projections onto a subspace: \\(u_1, \\dots, u_n\\), such that \\(\\sum_{i = 1}^n \\Vert x_i - u_i\\Vert^2\\) is minimized. This objective function can be interpreted in two ways: Interpretation 1: minimizing the objective is equivalent to maximizing the variance of the projections/principal components, \\(\\sum_{i} \\Vert u_i \\Vert^2\\), if the data are centered at the origin (\\(\\sum_{i} x_i = 0\\)); Interpretation 2: Each point \\(x_i\\) is thought of as a random draw from a probability distribution centered at \\(u_i\\). If we take this probability as a unit Gaussian, that is \\(x_i \\sim N(u_i, 1)\\), then the likelihood is \\(\\prod_{i = 1}^n \\exp (- \\Vert x_i - u_i\\Vert^2)\\), and the negative log likelihood is exactly the objective function. This assumption is often inappropriate for non-Gaussian distributed data, for example discrete data. Therefore, GLM-PCA generalizes the Gaussian likelihood into a likelihood of any exponential-family distribution, and applies appropriate link functions to the \\(u_i\\)’s in the same way as a GLM does to non-Gaussian responses. The following example compares GLM-PCA with Poisson marginals to the traditional PCA (identical to the result from plotPCA). ## GLM-PCA Y &lt;- assay(deng, &quot;counts&quot;) Y &lt;- Y[rowSums(Y) &gt; 0, ] system.time(res1 &lt;- glmpca(Y, L=2, fam=&quot;poi&quot;, verbose=TRUE)) ## user system elapsed ## 87.515 23.205 110.738 pd1 &lt;- data.frame(res1$factors, dimreduce=&quot;glmpca-poisson&quot;, clust = factor(deng$cell_type2)) ## traditional PCA pd2 &lt;- data.frame(reducedDim(deng, &quot;PCA&quot;), dimreduce=&quot;runPCA&quot;, clust = factor(deng$cell_type2)) colnames(pd2) &lt;- colnames(pd1) ## plot pd &lt;- rbind(pd1, pd2) ggplot(pd, aes(x = dim1, y = dim2, colour = clust)) + geom_point(size=2) + facet_wrap(~dimreduce, scales=&quot;free&quot;, nrow=3) + scale_color_manual(values = my_color2) + theme_bw() Let us compare GLM-PCA and standard PCA (using normalized log-counts data) on the Tung data, before cells have been QC’d. Repeat these plots with the QC’d Tung data. 9.1.2 tSNE: t-Distributed Stochastic Neighbor Embedding t-SNE (Maaten and Hinton 2008) is an advanced version of the original SNE algorithm (Hinton and Roweis 2003). 9.1.2.1 Motivation The weakness of PCA is the motivation behind the SNE algorithm. PCA focuses on global covariance structure, which can lead to local inconsistency. SNE aims to preserve local structure, i.e. to preserve the relationships among data points (similar points remain similar; distinct points remain distinct). Unlike PCA, SNE is not limited to linear projections, which makes it suited to all sorts of datasets, including the swiss-roll data we have seen above. t-SNE solves the crowding issue of the original SNE. 9.1.2.2 original SNE SNE minimizes the divergence between two distributions: a distribution that measures pairwise similarities of the input objects and a distribution that measures pairwise similarities of the corresponding low-dimensional points in the embedding. Goal: preserve neighbourhoods. Soft neighbourhood: For each data point \\(x_i\\), the \\(i\\rightarrow j\\) probability is the probability that point \\(x_i\\) chooses \\(x_j\\) as its neighbour: \\(p_{j|i} \\propto \\exp(-\\Vert x_i - x_j \\Vert^2/2\\delta^2)\\).
(This can be thought of as the probability of \\(x_j\\) under \\(N(x_i, \\delta)\\).) \\(\\Vert x_i - x_j \\Vert^2\\) is the Euclidean distance: the closer \\(x_i\\) and \\(x_j\\) are, the larger \\(p_{j|i}\\) is. \\(\\delta^2\\) denotes the variance; it sets the size of the neighbourhood. Very low \\(\\Rightarrow\\) all the probability is in the nearest neighbour. Very high \\(\\Rightarrow\\) uniform weights. We generally want \\(\\delta^2\\) to be small for points in densely populated areas and large for sparse areas, so that the number of neighbours of all data points is roughly the same. It is computed from a user-specified parameter (perplexity) which indicates the effective number of neighbours for a data point. Similarity matrix: Collect \\(p_{j|i}\\) for all data points into a matrix; this matrix then preserves the key information about the local neighbourhood structure. How SNE works: Given high-dimensional data \\(X = \\{x_1, \\dots, x_n \\in \\mathcal{R}^d \\}\\), obtain the similarity matrix \\(P\\); Let \\(Y =\\{y_1, \\dots, y_n \\in \\mathcal{R}^2\\}\\) be 2-dimensional data, the coordinates for visualization. Obtain a similarity matrix of \\(Y\\), denoted as \\(Q\\), in the same way as for \\(X\\), except that \\(\\delta^2\\) is fixed at 1/2. Look for \\(Y\\) such that \\(Q\\) is as similar to \\(P\\) as possible. Measure of how similar two distributions are: the Kullback-Leibler divergence. (The definition of this cost function and the optimization procedure are out of the scope of this course.) 9.1.2.3 t-SNE The motivation of t-SNE is to solve one of the main issues of SNE, the crowding problem. Crowding problem: In high dimensions we have more room, so points can have a lot of different neighbours that are far apart from each other. But in low dimensions, we don’t have enough room to accommodate all neighbours. For example, in 2D a point can have a few neighbours at distance one that are all far from each other; what happens when we embed in 1D? Solution: Change the distribution of the low-dimensional data \\(Q\\) into a Student-t distribution. Recall that SNE is trying to minimize the dissimilarity of \\(P\\) and \\(Q\\), and \\(P\\) has a Gaussian distribution. Because the Student-t distribution has heavier tails, for a pair of points (\\(x_i\\) and \\(x_j\\) in high dimension, \\(y_i\\) and \\(y_j\\) in low dimension) to reach the same probability, the distance between \\(y_i\\) and \\(y_j\\) must be much larger (i.e. the points are placed much farther apart). 9.1.2.4 Example of t-SNE: muraro &lt;- readRDS(&quot;data/pancreas/muraro.rds&quot;) tmp &lt;- runTSNE(muraro, perplexity = 3) plotTSNE(tmp, colour_by = &quot;cell_type1&quot;) tmp &lt;- runTSNE(muraro, perplexity = 50) plotTSNE(tmp, colour_by = &quot;cell_type1&quot;) 9.1.2.5 Limits of t-SNE: Not a convex problem, i.e. the cost function has multiple local minima. Non-deterministic. Requires specification of very important parameters, e.g. perplexity. Coordinates after embedding have no meaning; therefore the embedding can merely be used for visualization. (See here for more pitfalls of using t-SNE.) 9.1.3 Manifold methods 9.1.3.1 UMAP: Uniform Manifold Approximation and Projection (McInnes, Healy, and Melville 2018) 9.1.3.1.1 Advantages of UMAP over t-SNE: faster deterministic better at preserving clusters 9.1.3.1.2 High level description Construct a topological representation of the high-dimensional data (in this case a weighted \\(k\\)-NN graph) Given low-dimensional data, construct a graph in a similar way Minimize the dissimilarity between the two graphs.
(Look for the low-dimensional data whose graph is the closest to that of the high-dimensional data.) 9.1.3.1.3 Some details How is the weighted graph built? Obtain dissimilarity from the input distance: For each data point \\(x_i\\), find its \\(k\\) nearest neighbours: \\(x_{i_1}, \\dots, x_{i_k}\\). Let \\(d(x_i, x_{i_j})\\) be the input or original distance between \\(x_i\\) and \\(x_{i_j}\\), and \\(\\rho_i = \\min[d(x_i, x_{i_j}); 1 \\leq j \\leq k]\\) be the distance between \\(x_i\\) and its nearest neighbour. Then the dissimilarity between \\(x_i\\) and \\(x_{i_j}\\) is measured simply by subtracting \\(\\rho_i\\) from the original distance: \\(\\tilde{d}(x_i, x_{i_j}) = d(x_i, x_{i_j}) - \\rho_i\\). Transform dissimilarity to similarity: \\(s(x_i, x_{i_j}) = \\exp[-\\tilde{d}(x_i, x_{i_j})] - c_i\\), where \\(c_i\\) is a scale factor to ensure \\(\\sum_{j = 1}^k s(x_i, x_{i_j})\\) is a constant for all \\(i\\). Similarity itself can serve as edge weights, but this similarity is not symmetrical, i.e. \\(s(x_i, x_{i_j}) \\neq s(x_{i_j}, x_i)\\). To be able to project this onto an undirected graph, we need to resolve the disagreement between \\(s(x_i, x_{i_j})\\) and \\(s(x_{i_j}, x_i)\\). Obtain weights: \\(w(x_i, x_{i_j}) = s(x_i, x_{i_j}) + s(x_{i_j}, x_i) - s(x_i, x_{i_j}) * s(x_{i_j}, x_i)\\) (Interpretation: \\(P(A \\cup B ) = P(A) + P(B) - P(A)P(B)\\) if \\(A\\) and \\(B\\) are independent) How is the dissimilarity between the graphs measured? Cross entropy. 9.1.3.1.4 Example of UMAP muraro &lt;- runUMAP(muraro) plotUMAP(muraro, colour_by=&quot;cell_type1&quot;) 9.1.3.2 PHATE (Moon et al. 2017) 9.1.3.2.1 Sketch of algorithm The simplest description of PHATE: Step1. Create a dissimilarity matrix of the original data Step2. Feed the dissimilarity matrix to non-metric MDS (MDS, Multi-Dimensional Scaling, is a classical dimensionality reduction approach that takes a distance matrix as input and aims at preserving pairwise distances in the low-dimensional space. When the input distance matrix is the Euclidean distance, MDS produces the same result as PCA. Non-metric MDS generalizes the input to a dissimilarity matrix, rather than just a distance.) Details of step1 in PHATE Step1-1. Markov transition matrix - What is similar to SNE: Recall that in the original SNE algorithm, there is a similarity matrix with entry \\(p_{j|i}\\) that is interpreted as the probability that point \\(x_i\\) chooses \\(x_j\\) as its neighbour: \\(p_{j|i} \\propto \\exp(-\\Vert x_i - x_j \\Vert^2/2\\delta^2)\\). PHATE is doing the same, except that we can interpret it differently: i. We can think of \\(p_{j|i}\\) as a Gaussian kernel, where \\(\\epsilon \\triangleq 2\\delta^2\\) is the bandwidth: \\(p_{j|i} \\triangleq K_\\epsilon(x_i, x_j )\\). Similar to SNE, PHATE also defines \\(\\epsilon\\) as the \\(k\\)-NN distance of each data point, so that it is smaller in dense areas and larger in sparse areas. The \\(k\\) is a user-specified tuning parameter, similar to perplexity in SNE. ii. We can think of the similarity matrix as a transition matrix, where \\(p_{j|i}\\) represents the probability of jumping from state \\(i\\) to state \\(j\\) in a single step. - What is different: i. PHATE generalizes \\(K_\\epsilon(x_i, x_j)\\) to \\(\\exp \\left(- \\Vert x_i - x_j \\Vert^\\alpha /\\epsilon(x_i)^\\alpha\\right)\\), where the original Gaussian kernel is the special case when \\(\\alpha = 2\\).
The motivation is that if the data are very sparse in some regions, then the bandwidth \\(\\epsilon\\) will be very large and the kernel will become flat and lose the local information. By letting \\(\\alpha &gt; 2\\), we prevent this from happening, although \\(\\alpha\\) needs to be provided by the user. ii. Note that the kernels are not symmetrical now, that is \\(K_\\epsilon(x_i, x_j) \\neq K_\\epsilon(x_j, x_i)\\). So we make them symmetrical by taking an average of the two. Step1-2. Smoothing - \\(P\\) is the transition matrix where \\(p_{i, j}\\) represents the probability of jumping from state \\(i\\) to state \\(j\\) in a single step. - Denote \\(\\delta_x\\) as a row vector of length \\(n\\) (the number of data points), where only the entry corresponding to \\(x\\) is 1 and zero everywhere else. Then \\(p_x = \\delta_x P\\) is the probability distribution of the data points starting from \\(x\\) after one step, and \\(p_x^t = \\delta_x P^t\\) is the probability distribution of the data points after \\(t\\) steps. In general, the more steps we take, the more data points will have positive probabilities. One way to think about this is that the larger \\(t\\) is, the more global information and the less local information is encoded. In the extreme case, if we take infinitely many steps, \\(p_x^\\infty\\) will be the same for all \\(x\\)’s, i.e. the probability distribution is going to be the same regardless of where we start; in this case, the local information is completely lost. An appropriately chosen \\(t\\) is crucial for the balance between local and global information in the embedding. (See the original paper for details of choosing \\(t\\).) Step1-3. Distance measurement Instead of directly measuring the Euclidean distance between data points, say \\(x_i\\) and \\(x_j\\), PHATE measures the distance between probability distributions \\(p_{x_i}^t\\) and \\(p_{x_j}^t\\): \\(D^t(x_i, x_j) = \\Vert \\log(p_{x_i}^t) - \\log(p_{x_j}^t) \\Vert^2\\) 9.1.3.2.2 Example of PHATE library(phateR) deng_phate &lt;- phate(t(assay(deng, &quot;logcounts&quot;))) dt &lt;- data.frame(deng_phate$embedding, clust = deng$cell_type1) palette(rainbow(10)) ggplot(dt, aes(x=PHATE1, y=PHATE2, color=clust)) + geom_point() 9.2 Matrix factorization and factor analysis The key concept of factor analysis: The original, observed variables are correlated because they are all associated with some unobservable variables, the latent factors. It looks similar to PCA, but instead of dimensionality reduction, factor analysis focuses on studying the latent factors. The variance of an observed variable can be split into two parts: - Common variance: the part of variance that is explained by latent factors; - Unique variance: the part that is specific to only one variable, usually considered as an error component or residual. The factor loadings or weights indicate how much each latent factor is affecting the observed features. 9.2.1 Slalom: Interpretable latent spaces Highlights of Slalom (Buettner et al. 2017): It incorporates prior information to help the model estimation; it learns whatever is not provided by prior knowledge in the model training process; and it enforces sparsity in the weight matrix.
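Before looking at how Slalom builds on it, it helps to write down the standard factor analysis model (a textbook formulation, not Slalom-specific notation): each observed expression vector \\(y_n\\) is a weighted combination of \\(K\\) latent factors plus noise, \\[ y_n = W z_n + \\epsilon_n, \\qquad z_n \\sim N(0, I), \\quad \\epsilon_n \\sim N(0, \\Psi), \\] where the entries of the weight (loading) matrix \\(W = (w_{g, k})\\) indicate how strongly each factor \\(k\\) affects each gene \\(g\\), and the diagonal covariance \\(\\Psi\\) carries the unique variances described above.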
9.2.1.1 Methodology Matrix expression of factor analysis: How prior knowledge affects the model: \\(I_{g, k}\\): (observed) Indicator of whether a gene \\(g\\) is annotated to a given pathway or factor \\(k\\); \\(z_{g, k}\\): (latent) Indicator of whether factor \\(k\\) has a regulatory effect on gene \\(g\\); \\(w_{g, k}\\): (estimated) weights. grey arrow: \\[ P(I_{g, k}\\vert z_{g, k}) = \\begin{cases} \\text{Bernoulli}(p_1), \\text{if } z_{g, k} = 1\\\\ \\text{Bernoulli}(p_2), \\text{if } z_{g, k} = 0\\\\ \\end{cases}\\] green arrow: \\[ P(w_{g, k}\\vert z_{g, k}) = \\begin{cases} N(w_{g, k}, 1/\\alpha), \\text{ if } z_{g, k} = 1\\\\ \\delta_0(w_{g, k}), \\text{ if } z_{g, k} = 0\\\\ \\end{cases}\\] We only look at the part of the likelihood that is relevant here: \\(\\prod_{g} \\prod_{k}P(I_{g, k}, w_{g, k}, z_{g, k})\\), where \\(P(I_{g, k}, w_{g, k}, z_{g, k}) = P(I_{g, k}, w_{g, k}| z_{g, k})P(z_{g,k}) = P( I_{g, k}| z_{g, k})P( w_{g, k}| z_{g, k})P(z_{g,k})\\). Since we do not know anything about \\(z_{g,k}\\), it is assigned a Bernoulli(1/2) prior. 9.2.1.2 Example First, get a gene set in a GeneSetCollection object. gmtfile &lt;- system.file(&quot;extdata&quot;, &quot;reactome_subset.gmt&quot;, package = &quot;slalom&quot;) genesets &lt;- GSEABase::getGmt(gmtfile) Then we create an Rcpp_SlalomModel object containing the input data and genesets (and subsequent results) for the model. model_deng &lt;- newSlalomModel(deng, genesets, n_hidden = 5, min_genes = 10) ## 29 annotated factors retained; 1 annotated factors dropped. ## 1072 genes retained for analysis. Initialize the model: model_deng &lt;- initSlalom(model_deng, seed = 100) Fit/train the model: model_deng &lt;- trainSlalom(model_deng, nIterations = 1000, seed = 100, tolerance = 0.001) ## pre-training model for faster convergence ## iteration 0 ## Model not converged after 50 iterations. ## iteration 0 ## Model not converged after 50 iterations. ## iteration 0 ## Switched off factor 29 ## Switched off factor 20 ## Switched off factor 32 ## Switched off factor 28 ## Switched off factor 13 ## Switched off factor 27 ## Switched off factor 10 ## iteration 100 ## Switched off factor 22 ## iteration 200 ## iteration 300 ## iteration 400 ## iteration 500 ## iteration 600 ## iteration 700 ## Model converged after 701 iterations. View results: The plotRelevance function displays the most relevant terms (factors/pathways) ranked by relevance, showing gene set size and the number of genes gained/lost as active in the pathway as learnt by the model. plotRelevance(model_deng) + theme_classic(base_size = 8) ## NULL The plotTerms function shows the relevance of all terms in the model, enabling the identification of the most important pathways in the context of all that were included in the model. plotTerms(model_deng) 9.3 Autoencoders (Kingma and Welling 2013) 9.3.1 Background and some notations Data: \\(X\\) Latent variables: \\(Z\\) Something that is not directly observable but is assumed to have an impact on the observed variables. Goal: We believe \\(X\\) can be generated from \\(Z\\) (with some transformation), and want to sample more data from \\(Z\\) that resemble \\(X\\). So we want to find the parameters \\(\\theta\\) such that the probability of generating \\(X\\) from the distribution of \\(Z\\), \\(P(X) = \\int P(X|z; \\theta) P(z) dz\\), is maximized. How do we define \\(Z\\)? The simplest idea: \\(Z \\sim N(0, 1)\\).
It is not impossible, because “any distribution in d dimensions can be generated by taking a set of d variables that are normally distributed and mapping them through a sufficiently complicated function.” A better idea: For most values of \\(z\\), \\(P(X|z; \\theta)\\) will be close to zero, meaning each contributes almost nothing to the estimate of \\(P(X)\\). Thus, we want to sample only those values of \\(Z\\) that are likely to produce \\(X\\). Denote this distribution of \\(Z\\) as \\(Q(Z|X)\\) (it is inferred and therefore depends on \\(X\\)). Advantage: There will be far fewer possible values of \\(Z\\) under \\(Q\\) compared to random sampling; therefore, it will be easier to compute \\(E_{Z \\sim Q} P(X|Z)\\). 9.3.2 Objective \\[ \\log P(X) - KL[Q(Z|X)\\Vert P(Z|X)] = E_{Z\\sim Q}[\\log P(X|Z)] - KL[Q(Z|X)\\Vert P(Z)]\\] We can get this equation by starting from the definition of the Kullback-Leibler divergence, combined with Bayes’ formula and a little algebra. (Details are not shown here.) LHS: what we want to maximize: the log-likelihood of generating samples that resemble \\(X\\), minus an error term which measures how much information is lost when using \\(Q\\) to represent the true posterior \\(P(Z|X)\\); this error becomes small if \\(Q\\) is high-capacity. (A loose explanation of model capacity: Roughly speaking, the capacity of a model describes how complex a relationship it can model. You could expect a model with higher capacity to be able to model more relationships between more variables than a model with a lower capacity.) RHS: what we can maximize through stochastic gradient descent. References "],
-["clustering-and-cell-annotation.html", "10 Clustering and cell annotation 10.1 Clustering Methods 10.2 Clustering example 10.3 An alternative to clustering: Automatic cell annotation", " 10 Clustering and cell annotation 10.1 Clustering Methods Once we have normalized the data and removed confounders we can carry out analyses that are relevant to the biological questions at hand. The exact nature of the analysis depends on the dataset. Nevertheless, there are a few aspects that are useful in a wide range of contexts and we will be discussing some of them in the next few chapters. We will start with the clustering of scRNA-seq data. 10.1.1 Introduction One of the most promising applications of scRNA-seq is de novo discovery and annotation of cell-types based on transcription profiles. Computationally, this is a hard problem as it amounts to unsupervised clustering. That is, we need to identify groups of cells based on the similarities of the transcriptomes without any prior knowledge of the labels. Moreover, in most situations we do not even know the number of clusters a priori. The problem is made even more challenging due to the high level of noise (both technical and biological) and the large number of dimensions (i.e. genes). When working with large datasets, it can often be beneficial to apply some sort of dimensionality reduction method. By projecting the data onto a lower-dimensional sub-space, one is often able to significantly reduce the amount of noise. An additional benefit is that it is typically much easier to visualize the data in a 2 or 3-dimensional subspace. We have already discussed PCA (chapter 6.6.2) and t-SNE (chapter 6.6.2). Challenges in clustering What is the number of clusters k? What defines a good clustering? What is a cell type? Scalability: in the last few years the number of cells in scRNA-seq experiments has grown by several orders of magnitude from ~\\(10^2\\) to ~\\(10^6\\) 10.1.2 Unsupervised clustering methods Three main ingredients of a complete clustering method: Measure of similarity: how do we quantify how close two data points are? Quality function: how do we decide how “good” is a clustering/partition? Algorithm: how to find the clustering whose quality function is optimized? 10.1.2.1 Hierarchical clustering Hierarchical clustering is basically the only type of clustering algorithm that does not seek to optimize a quality function, because it builds a hierarchy of clusters, instead of one single clustering result as the output. There are two types of strategies: - Agglomerative (bottom-up): each observation starts in its own cluster, and pairs of clusters are merged as one moves up the hierarchy. - Divisive (top-down): all observations start in one cluster, and splits are performed recursively as one moves down the hierarchy.   10.1.2.2 k-means clustering Measure of similarity: Euclidean distance Quality function: Within cluster distance Algorithm: Advantage: Fast Drawbacks: - Sensitive to initial clustering - Sensitive to outliers - Need to specify K - Tend to find clusters of similar sizes Tools related to K-means: SC3 10.1.2.3 Graph-based methods Real world networks usualy display big inhomogeneities or community structure. Communities or clusters or modules are groups of vertices which probably share common properties and/or play similar roles whithin the graph. In recent years there has been a lot of interest in detecting communities in networks in various domains. 
10.1.2.3 Graph-based methods Real-world networks usually display large inhomogeneities, or community structure. Communities, or clusters or modules, are groups of vertices which probably share common properties and/or play similar roles within the graph. In recent years there has been a lot of interest in detecting communities in networks in various domains. Some of these community detection methods can be applied to scRNA-seq data by building a graph where each vertex represents a cell and (the weight of) an edge measures the similarity between two cells. In fact, graph-based clustering is the most popular clustering approach in scRNA-seq data analysis, and has been reported to outperform other clustering methods in many situations (Freytag et al. 2018). 10.1.2.3.1 Why do we want to represent the data as a graph? Memory effectiveness: A (complete) graph can be thought of as an alternative representation of the similarity matrix. Current methods (discussed later) aim to build sparse graphs, which ease the memory burden. Curse of dimensionality: All data become sparse in high-dimensional space, and therefore similarities measured by Euclidean distance etc. are generally low between all objects. 10.1.2.3.2 Building a graph Step1: Build an unweighted K-nearest neighbour (KNN) graph Step2: Add weights, and obtain a shared nearest neighbour (SNN) graph There are two ways of adding weights: number and rank. - number: The number of shared nodes between \\(u\\) and \\(v\\), in this case, 3. - rank: A measurement of the closeness to their common nearest neighbours. Details of rank: Main idea: The closeness of two people is defined by their closest common friend. For each node, say \\(u\\), we can rank its 5 neighbours according to their closeness to \\(u\\), and we can do the same with \\(v\\). Denote the three shared neighbours as \\(x_1\\), \\(x_2\\) and \\(x_3\\), so rank(\\(x_1, u\\)) = 1 means \\(x_1\\) is the closest neighbour of \\(u\\). The idea is, if \\(x_1\\) is also the closest to \\(v\\), then \\(u\\) and \\(v\\) should have a larger similarity, or weight. So we summarize the overall closeness of \\(x_1\\) to both \\(u\\) and \\(v\\) by taking the average: \\(\\dfrac{1}{2}(\\text{rank}(x_1, u) + \\text{rank}(x_1, v))\\). Then we find the shared neighbour with the greatest closeness, i.e. the smallest average rank: \\(s(u, v) = \\min \\left[ \\dfrac{1}{2}(\\text{rank}(x_i, u) + \\text{rank}(x_i, v)) \\vert i = 1, 2, 3\\right]\\). The final expression of the weight: \\[ w(u, v) = K - s(u, v).\\] 10.1.2.3.3 Quality function (Modularity) Modularity (Newman and Girvan 2004) is not the only quality function for graph-based clustering, but it is one of the first attempts to embed in a compact form many questions, including the definition of the quality function and the null model. The idea of modularity: A random graph should not have a cluster structure. The more “quality” a partition has compared to a random graph, the “better” the partition is. Specifically, it is defined by: the quality of a partition on the actual graph \\(-\\) the quality of the same partition on a random graph quality: the sum of the weights within clusters random graph: a copy of the original graph, with some of its properties, but without community structure. The random graph defined by modularity is one in which each node has the same degree as in the original graph. \\[ Q \\propto \\sum_{i, j} A_{i, j} \\delta(i, j) - \\sum_{i, j} \\dfrac{k_i k_j}{2m} \\delta(i, j)\\] \\(A_{i, j}\\): weight between nodes \\(i\\) and \\(j\\); \\(\\delta(i, j)\\): indicator of whether \\(i\\) and \\(j\\) are in the same cluster; \\(k_i\\): the degree of node \\(i\\) (the sum of weights of all edges connected to \\(i\\)); \\(m\\): the total weight in the whole graph. Higher modularity implies a better partition. Limits of modularity (Good, De Montjoye, and Clauset 2010): 1. Resolution limit. 
Short version: Modularity maximization forces small communities into larger ones. Longer version: For two clusters \\(A\\) and \\(B\\), if \\(k_A k_B &lt; 2m\\) then modularity increases by merging A and B into a single cluster, even if A and B are distinct clusters. 2. Bad, even random, partitions may have a high modularity, and networks may lack a clear modularity maximum. 10.1.2.3.4 Algorithms: Modularity-based clustering methods implemented in single-cell analysis are mostly greedy algorithms, which are very fast, although not the most accurate approaches. Louvain: (Blondel et al. 2008) Leiden: (Traag, Waltman, and Eck 2019) an improved Louvain, a hybrid of a greedy algorithm and a sampling technique. 10.1.2.3.5 Advantages: - Fast - No need to specify \\(k\\) 10.1.2.3.6 Tools for graph-based clustering: Seurat: Louvain, Leiden, SLM igraph: fast greedy, Louvain, optimal, walktrap, spinglass, infomap 10.1.2.4 Consensus clustering (more robustness, less computational speed) 10.1.2.4.1 Motivation (two problems of \\(K\\)-means): Problem1: sensitive to initial partitions. Solution: run multiple iterations of \\(K\\)-means on different subsamples of the original dataset, with different initial partitions. Problem2: the selection of \\(K\\). Solution: run \\(K\\)-means with a range of \\(K\\)'s. 10.1.2.4.2 Algorithm of consensus clustering (simplest version, as pseudocode): for (k in k_range) { for (subsample in subsamples) { for (iteration in 1:1000) { partitions[[iteration]] &lt;- kmeans(subsample, centers = k)$cluster # each iteration starts from a different random initial partition } } # combine the saved partitions into the consensus clustering result for this k } 10.1.2.4.3 Subsamples obtained by dimensionality reduction: steps of PCA: i) transformation of the similarity matrix; ii) ranking the eigenvectors according to their corresponding eigenvalues in decreasing order; iii) deciding how many (\\(d\\)) PCs or eigenvalues to reduce to. In SC3, i) considers two types of transformation: traditional PCA and the associated graph Laplacian, and for iii) the user may specify a range of \\(d\\), or use the default range suggested by the authors based on their empirical results. 10.1.2.4.4 Consensus clustering (combining multiple clustering results): Step1: Represent each partition as a matrix: say we partitioned four data points into 2 clusters. Step2: Consensus matrix: the average of all the partitions. 10.1.2.4.5 Tools for consensus clustering: SC3 10.2 Clustering example library(pcaMethods) library(SC3) library(scater) library(SingleCellExperiment) library(pheatmap) library(mclust) library(igraph) library(scran) 10.2.1 Example 1. Graph-based clustering (deng dataset) To illustrate clustering of scRNA-seq data, we consider the Deng dataset of cells from the developing mouse embryo (Deng et al. 2014). We have preprocessed the dataset and created a SingleCellExperiment object in advance. We have also annotated the cells with the cell types identified in the original publication (the cell_type2 column in the colData slot). deng &lt;- readRDS(&quot;data/deng/deng-reads.rds&quot;) First, we build a \\(K\\)-NN graph with a package function from scran. The most important decision in building a graph is the choice of \\(K\\), for which there is no standard rule. In general, we can think of it as an indication of the desired cluster size. If \\(K\\) is too small, a genuine cluster might be split into parts, while if \\(K\\) is too large, clusters might not be thoroughly separated. 
deng5 &lt;- buildSNNGraph(deng, k = 5) deng15 &lt;- buildSNNGraph(deng, k = 15) deng25 &lt;- buildSNNGraph(deng, k = 25) par(mfrow=c(1,3)) plot(deng5, vertex.size = 4, vertex.label = NA) title(&quot;5-NN&quot; ,line = -33, cex.main = 3) plot(deng15, vertex.size = 4, vertex.label = NA) title(&quot;15-NN&quot; ,line = -33, cex.main = 3) plot(deng25, vertex.size = 4, vertex.label = NA) title(&quot;25-NN&quot; ,line = -33, cex.main = 3) Perform Louvain clustering: cl &lt;- igraph::cluster_louvain(deng15)$membership colData(deng)$cl &lt;- factor(cl) mclust::adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$cl) ## [1] 0.4197754 The clustering agrees reasonably well with the labels provided in the original paper; however, it tends to merge small clusters into larger ones. table(deng$cell_type2, cl) ## cl ## 1 2 3 ## 16cell 49 0 1 ## 4cell 0 14 0 ## 8cell 36 0 1 ## early2cell 0 8 0 ## earlyblast 0 0 43 ## late2cell 0 10 0 ## lateblast 0 0 30 ## mid2cell 0 12 0 ## midblast 0 0 60 ## zy 0 4 0 10.2.2 Example 2. Graph-based clustering (muraro dataset) muraro &lt;- readRDS(&quot;data/pancreas/muraro.rds&quot;) ## PCA var.fit &lt;- suppressWarnings(trendVar(muraro, parametric=TRUE, use.spikes=F)) muraro &lt;- suppressWarnings(denoisePCA(muraro, technical=var.fit$trend)) dim(reducedDim(muraro, &quot;PCA&quot;)) ## [1] 2126 5 ## Build graph and clustering gr &lt;- buildSNNGraph(muraro, use.dimred=&quot;PCA&quot;, k = 30) cl &lt;- igraph::cluster_louvain(gr)$membership colData(muraro)$cl &lt;- factor(cl) mclust::adjustedRandIndex(colData(muraro)$cell_type1, colData(muraro)$cl) ## [1] 0.4845618 table(muraro$cell_type1, cl) ## cl ## 1 2 3 4 5 6 7 8 9 ## acinar 0 0 0 0 0 0 218 0 1 ## alpha 202 306 274 5 15 9 1 0 0 ## beta 1 0 0 5 195 21 2 220 4 ## delta 0 0 0 0 18 174 0 1 0 ## ductal 0 0 0 215 0 1 7 3 19 ## endothelial 0 0 0 0 0 0 0 0 21 ## epsilon 0 0 0 0 0 3 0 0 0 ## gamma 1 0 1 0 0 97 2 0 0 ## mesenchymal 0 0 0 1 0 0 0 0 79 ## unclear 0 0 0 4 0 0 0 0 0 10.2.3 Example 3. SC3 Let’s run SC3 clustering on the Deng data. The advantage of SC3 is that it can directly ingest a SingleCellExperiment object. SC3 can estimate the number of clusters: deng &lt;- sc3_estimate_k(deng) ## Estimating k... metadata(deng)$sc3$k_estimation ## [1] 6 Next we run SC3 (we also ask it to calculate biological properties of the clusters): deng &lt;- sc3(deng, ks = 10, biology = TRUE, n_cores = 1) ## Setting SC3 parameters... ## Calculating distances between the cells... ## Performing transformations and calculating eigenvectors... ## Performing k-means clustering... ## Calculating consensus matrix... ## Calculating biology... The SC3 result consists of several different outputs (see (Kiselev et al. 2017) and the SC3 vignette for more details). Here we show some of them: Consensus matrix: sc3_plot_consensus(deng, k = 10, show_pdata = &quot;cell_type2&quot;) Silhouette plot: sc3_plot_silhouette(deng, k = 10) Heatmap of the expression matrix: sc3_plot_expression(deng, k = 10, show_pdata = &quot;cell_type2&quot;) Identified marker genes: sc3_plot_markers(deng, k = 10, show_pdata = &quot;cell_type2&quot;) PCA plot with highlighted SC3 clusters: plotPCA(deng, colour_by = &quot;sc3_10_clusters&quot;) Compare the results of SC3 clustering with the original publication cell type labels: adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$sc3_10_clusters) ## [1] 0.6581962 Note SC3 can also be run in an interactive Shiny session: sc3_interactive(deng) This command will open SC3 in a web browser. 
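To make the consensus matrix idea behind SC3 concrete, here is a minimal sketch on toy data (base R only; SC3 itself combines many more distance measures, transformations and numbers of dimensions):
set.seed(1)
x &lt;- matrix(rnorm(40), ncol = 2)   # 20 toy cells
B &lt;- 100                           # number of k-means runs
co &lt;- matrix(0, nrow(x), nrow(x))
for (b in seq_len(B)) {
  cl &lt;- kmeans(x, centers = 3, nstart = 1)$cluster  # a different random initial partition each run
  co &lt;- co + outer(cl, cl, &quot;==&quot;)                    # add 1 where two cells co-cluster
}
consensus &lt;- co / B   # fraction of runs in which each pair of cells co-clusters
## final partition: hierarchical clustering on the consensus matrix
cutree(hclust(as.dist(1 - consensus)), k = 3)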
Note Due to the direct calculation of distances, SC3 becomes very slow when the number of cells is \\(&gt;5000\\). For large datasets containing up to \\(10^5\\) cells we recommend using Seurat (see chapter 16). 10.3 An alternative to clustering: Automatic cell annotation 10.3.1 SingleR 10.3.1.1 Methodology Step1. Find variable genes 1-1. For every gene, obtain the median expression grouped by label. 1-2. Select genes that make at least one label different. For example, if we are looking for the genes that make label “green” different from label “red”, we subtract the first column from the second and pick the top \\(N\\) highest, positive values. All analyses from here on use only the selected variable genes. Step2. Spearman’s correlation Spearman’s correlation \\(\\in [-1, 1]\\) is a measure of the strength of a monotonic (not necessarily linear) relationship between paired data. We compute the Spearman’s correlation for all pairs of cells in the test and reference dataset, and obtain an \\(n_{\\text{test}} \\times n_{\\text{ref}}\\) correlation matrix, where \\(n\\) is the number of cells (see the first matrix in Step3). Step3. Scoring We want to know how each cell in the test data is correlated to the labels in the reference data, rather than to each individual reference cell. So we take the correlations of a cell in the test data with all the cells of a certain label in the reference data, and summarize them into one number, or score; in SingleR, the default is to take the \\(80\\%\\) quantile (a toy sketch of this scoring step is given after the example below). Step4. Fine tuning We could stop here and assign each cell the label with the highest score; in fact, if we set the argument fine.tune = FALSE, that is exactly what the package function SingleR does. But there is one more question: what if the second highest score is very close to the highest? Say, 1, 1, 1, 9.5, 10. SingleR sets a threshold to define how close is “very close”; the default is 0.05. For (only) the cells that fall into this category, it goes back to Step2. 10.3.1.2 Example (Note: SingleR is not yet available in the released version of Bioconductor. It will be possible to run it as shown once the next Bioconductor release is made in late October.) library(scRNAseq) library(SingleR) segerstolpe &lt;- readRDS(&quot;data/pancreas/segerstolpe.rds&quot;) sceM &lt;- suppressMessages(MuraroPancreasData()) sceM &lt;- sceM[,!is.na(sceM$label)] sceM &lt;- logNormCounts(sceM) ## find common genes rownames(sceM) &lt;- gsub(&quot;__.*&quot;,&quot;&quot;,rownames(sceM)) common &lt;- intersect(rownames(sceM), rownames(segerstolpe)) sceM &lt;- sceM[common,] segerstolpe &lt;- segerstolpe[common,] ## Prepare reference out &lt;- pairwiseTTests(logcounts(sceM), sceM$label, direction=&quot;up&quot;) markers &lt;- getTopMarkers(out$statistics, out$pairs, n=10) ## Annotation pred &lt;- SingleR(test=segerstolpe, ref=sceM, labels=sceM$label, genes=markers) ## View result plotScoreHeatmap(pred, show.labels = TRUE, annotation_col=data.frame( row.names=rownames(pred))) 
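As a toy illustration of the scoring step (Step3) above, here is a sketch that is not the package's internal code: it assumes a precomputed Spearman correlation matrix cors with test cells in rows and reference cells in columns, and a vector ref_labels of reference cell labels.
## score each test cell against each reference label:
## the 80% quantile of its correlations with all reference cells of that label
score_labels &lt;- function(cors, ref_labels) {
  sapply(split(seq_along(ref_labels), ref_labels), function(idx) {
    apply(cors[, idx, drop = FALSE], 1, quantile, probs = 0.8)
  })
}
## without fine-tuning, each test cell simply gets the best-scoring label:
## scores &lt;- score_labels(cors, ref_labels)
## colnames(scores)[max.col(scores)]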
10.3.2 scmap ## Load data segerstolpe &lt;- readRDS(&quot;data/pancreas/segerstolpe.rds&quot;) # test library(scRNAseq) sceM &lt;- readRDS(&quot;data/pancreas/muraro.rds&quot;) # reference rownames(sceM) &lt;- gsub(&quot;__.*&quot;,&quot;&quot;,rownames(sceM)) Select the most informative features (genes) using the dropout feature selection method. By default, 500 features are selected. library(scmap) rowData(sceM)$feature_symbol &lt;- rownames(sceM) sceM &lt;- selectFeatures(sceM, suppress_plot = TRUE) An index of the reference dataset is then created. (For scmap-cluster the index is the median gene expression of each cluster; here we use scmap-cell, which indexes individual cells as follows.) First, the 500 selected features are chopped into \\(M = 50\\) chunks (low-dimensional subspaces). Second, each chunk is clustered into \\(k = \\sqrt{N}\\) clusters, where \\(N\\) is the number of cells. By default scmap uses the cell_type1 column of the colData slot in the reference to identify clusters. sceM &lt;- indexCell(sceM) The function indexCell writes the scmap_cell_index item of the metadata slot of the reference dataset sceM. This step has two outputs: names(metadata(sceM)$scmap_cell_index) ## [1] &quot;subcentroids&quot; &quot;subclusters&quot; subcentroids returns the cluster centers: cat(length(metadata(sceM)$scmap_cell_index$subcentroids), &quot; chunks \\n&quot;) ## 50 chunks cat(&quot;The dimension of cluster centers in each chunk: &quot;, dim(metadata(sceM)$scmap_cell_index$subcentroids[[1]]), &quot;\\n&quot;) ## The dimension of cluster centers in each chunk: 10 46 subclusters contains information about which cluster each cell belongs to within each chunk: dim(metadata(sceM)$scmap_cell_index$subclusters) ## [1] 50 2126 metadata(sceM)$scmap_cell_index$subclusters[1:5,1:5] ## D28.1_1 D28.1_13 D28.1_15 D28.1_17 D28.1_2 ## [1,] 13 25 13 6 38 ## [2,] 23 46 36 25 11 ## [3,] 38 44 20 16 43 ## [4,] 18 36 32 4 33 ## [5,] 2 44 21 21 10 Projection: Once the scmap-cell indexes have been generated we can use them to project the test dataset. scmapCell_results &lt;- scmapCell( projection = segerstolpe, index_list = list( sceM = metadata(sceM)$scmap_cell_index ) ) names(scmapCell_results) ## [1] &quot;sceM&quot; The cells matrix contains the top 10 (scmap default) cell IDs of the cells of the reference dataset that a given cell of the projection dataset is closest to: dim(scmapCell_results$sceM$cells) ## [1] 10 3514 Cell annotation: If cell cluster annotation is available for the reference dataset, scmap-cell can also annotate the cells from the projection dataset with the labels of the reference. It does so by looking at the top 3 nearest neighbours (scmap default); if they all belong to the same cluster in the reference and their maximum similarity is higher than a threshold (0.5 is the scmap default), then the projection cell is assigned to the corresponding reference cluster: scmapCell_clusters &lt;- scmapCell2Cluster( scmapCell_results, list( colData(sceM)$cell_type1 )) Plot the result: compare the annotated result with the original labels of the segerstolpe dataset. plot( getSankey( segerstolpe$cell_type1, scmapCell_clusters$combined_labs, plot_height = 400 ) ) 10.3.3 sessionInfo() References "],
-["trajectory-inference.html", "11 Trajectory inference 11.1 First look at Deng data 11.2 TSCAN 11.3 Slingshot 11.4 Monocle 11.5 Other methods 11.6 Comparison of the methods 11.7 Expression of genes through time", " 11 Trajectory inference library(SingleCellExperiment) library(TSCAN) library(M3Drop) library(monocle) library(destiny) library(scater) library(ggplot2) library(ggthemes) library(ggbeeswarm) library(corrplot) library(Polychrome) library(slingshot) library(SLICER) library(ouija) set.seed(1) In many situations, one is studying a process where cells change continuously. This includes, for example, many differentiation processes taking place during development: following a stimulus, cells will change from one cell-type to another. Ideally, we would like to monitor the expression levels of an individual cell over time. Unfortunately, such monitoring is not possible with scRNA-seq since the cell is lysed (destroyed) when the RNA is extracted. Instead, we must sample at multiple time-points and obtain snapshots of the gene expression profiles. Since some of the cells will proceed faster along the differentiation than others, each snapshot may contain cells at varying points along the developmental progression. We use statistical methods to order the cells along one or more trajectories which represent the underlying developmental trajectories, this ordering is referred to as “pseudotime”. In this chapter we will consider five different tools: TSCAN,Slingshot,Monocle and some off-the-shelf methods like PCA, for ordering cells according to their pseudotime development. To illustrate the methods we will be using a dataset on mouse embryonic development (Deng et al. 2014). The dataset consists of 268 cells from 10 different time-points of early mouse development. In this case, there is no need for pseudotime alignment since the cell labels provide information about the development trajectory. Thus, the labels allow us to establish a ground truth so that we can evaluate and compare the different methods. A recent benchmarking paper by Saelens et al (Saelens et al. 2019) provides a detailed summary of the various computational methods for trajectory inference from single-cell transcriptomics (Saelens et al. 2019). They discuss 45 tools and evaluate them across various aspects including accuracy, scalability, and usability. The following figures from the paper summarise several key aspects and some of the features of the tools being evaluated: Figure 2.3: Overview of several key aspects of the evaluation (Fig. 1 from Saelens et al, 2019). The Characterizatics of the 45 TI tools: Figure 2.4: Characterization of trajectory inference methods for single-cell transcriptomics data (Fig. 2 from Saelens et al, 2019). The detailed evaluation results of the 45 TI tools: Figure 2.5: Detailed results of the four main evaluation criteria: accuracy, scalability, stability and usability of trajectory inference methods for single-cell transcriptomics data (Fig. 3 from Saelens et al, 2019). 11.1 First look at Deng data Let us take a first look at the Deng(Deng et al. 2014) data, without yet applying sophisticated pseudotime methods. As the plot below shows, simple PCA does a very good job of displaying the structure in these data. It is only once we reach the blast cell types (“earlyblast”, “midblast”, “lateblast”) that PCA struggles to separate the distinct cell types. 
deng_SCE &lt;- readRDS(&quot;data/deng/deng-reads.rds&quot;) deng_SCE$cell_type2 &lt;- factor( deng_SCE$cell_type2, levels = c(&quot;zy&quot;, &quot;early2cell&quot;, &quot;mid2cell&quot;, &quot;late2cell&quot;, &quot;4cell&quot;, &quot;8cell&quot;, &quot;16cell&quot;, &quot;earlyblast&quot;, &quot;midblast&quot;, &quot;lateblast&quot;) ) cellLabels &lt;- deng_SCE$cell_type2 deng &lt;- counts(deng_SCE) colnames(deng) &lt;- cellLabels deng_SCE &lt;- scater::runPCA(deng_SCE,ncomponent = 5) ## change colour palette with library(Polychrome) set.seed(723451) # for reproducibility my_color &lt;- createPalette(10, c(&quot;#010101&quot;, &quot;#ff0000&quot;), M=1000) names(my_color) &lt;- unique(as.character(deng_SCE$cell_type2)) pca_df &lt;- data.frame(PC1 = reducedDim(deng_SCE,&quot;PCA&quot;)[,1], PC2 = reducedDim(deng_SCE,&quot;PCA&quot;)[,2], cell_type2 = deng_SCE$cell_type2) ggplot(data = pca_df)+geom_point(mapping = aes(x = PC1, y = PC2, colour = cell_type2))+ scale_colour_manual(values = my_color)+theme_classic() PCA, here, provides a useful baseline for assessing different pseudotime methods. For a very naive pseudotime we can just take the co-ordinates of the first principal component. #deng_SCE$PC1 &lt;- reducedDim(deng_SCE, &quot;PCA&quot;)[,1] ggplot(pca_df, aes(x = PC1, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_colour_manual(values = my_color) + theme_classic() + xlab(&quot;First principal component&quot;) + ylab(&quot;Timepoint&quot;) + ggtitle(&quot;Cells ordered by first principal component&quot;) As the plot above shows, PC1 struggles to correctly order cells early and late in the developmental timecourse, but overall does a relatively good job of ordering cells by developmental time. Can bespoke pseudotime methods do better than naive application of PCA? 11.2 TSCAN TSCAN (Ji and Ji 2019) combines clustering with pseudotime analysis. First it clusters the cells using mclust, which is based on a mixture of normal distributions. Then it builds a minimum spanning tree (MST) to connect the clusters. The branch of this tree that connects the largest number of clusters is the main branch, which is used to determine pseudotime. Note Given a connected graph with weighted edges, the MST is the tree structure that connects all the nodes with the minimum total edge weight. Trajectory inference methods that use an MST are based on the idea that nodes (cells/clusters of cells) and their connections represent the geometric shape of the data cloud in a two-dimensional space. First we will try to use all genes to order the cells. procdeng &lt;- TSCAN::preprocess(counts(deng_SCE)) colnames(procdeng) &lt;- 1:ncol(deng_SCE) dengclust &lt;- TSCAN::exprmclust(procdeng, clusternum = 10) TSCAN::plotmclust(dengclust) dengorderTSCAN &lt;- TSCAN::TSCANorder(dengclust, orderonly = FALSE) pseudotime_order_tscan &lt;- as.character(dengorderTSCAN$sample_name) deng_SCE$pseudotime_order_tscan &lt;- NA deng_SCE$pseudotime_order_tscan[as.numeric(dengorderTSCAN$sample_name)] &lt;- dengorderTSCAN$Pseudotime Frustratingly, TSCAN only provides pseudotime values for 221 of 268 cells, silently returning missing values for non-assigned cells. We can examine which timepoints have been assigned to each state: cellLabels[dengclust$clusterid == 10] ## [1] late2cell late2cell late2cell late2cell late2cell late2cell late2cell ## [8] late2cell late2cell late2cell ## 10 Levels: zy early2cell mid2cell late2cell 4cell 8cell ... 
lateblast ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_order_tscan, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab(&quot;TSCAN pseudotime&quot;) + ylab(&quot;Timepoint&quot;) + ggtitle(&quot;Cells ordered by TSCAN pseudotime&quot;) TSCAN gets the development trajectory the “wrong way around”, in the sense that later pseudotime values correspond to early timepoints and vice versa. This is not inherently a problem (it is easy enough to reverse the ordering to get the intuitive interpretation of pseudotime), but overall it would be a stretch to suggest that TSCAN performs better than PCA on this dataset. (As it is a PCA-based method, perhaps this is not entirely surprising.) Exercise 1 Compare results for different numbers of clusters (clusternum). 11.3 Slingshot Slingshot (Street et al. 2018) is a single-cell lineage inference tool that can work with datasets with multiple branches. Slingshot has two stages: 1) the inference of the global lineage structure using an MST on clustered data points, and 2) the inference of pseudotime variables for cells along each lineage by fitting simultaneous ‘principal curves’ across multiple lineages. Slingshot’s first stage uses a cluster-based MST to stably identify the key elements of the global lineage structure, i.e., the number of lineages and where they branch. This allows it to identify novel lineages while also accommodating the use of domain-specific knowledge to supervise parts of the tree (e.g., terminal cellular states). For the second stage, the authors propose a novel method called simultaneous principal curves, to fit smooth branching curves to these lineages, thereby translating the knowledge of global lineage structure into stable estimates of the underlying cell-level pseudotime variable for each lineage. Slingshot performed consistently well across different datasets in the benchmark by Saelens et al., so let’s give it a run on the deng dataset. Slingshot is recommended to be run on reduced dimensions. Note Principal curves are smooth one-dimensional curves that pass through the middle of a p-dimensional data set, providing a nonlinear summary of the data. They are nonparametric, and their shape is suggested by the data (Hastie and Stuetzle 1989). ## running slingshot deng_SCE &lt;- slingshot(deng_SCE, clusterLabels = &#39;cell_type2&#39;,reducedDim = &quot;PCA&quot;, allow.breaks = FALSE) ## Using diagonal covariance matrix summary(deng_SCE$slingPseudotime_1) ## Min. 1st Qu. Median Mean 3rd Qu. Max. NA&#39;s 
## 0.00 52.19 59.81 60.34 81.60 85.72 55 ## get lineages inferred by slingshot lnes &lt;- getLineages(reducedDim(deng_SCE,&quot;PCA&quot;), deng_SCE$cell_type2) ## Using diagonal covariance matrix lnes@lineages ## $Lineage1 ## [1] &quot;zy&quot; &quot;early2cell&quot; &quot;mid2cell&quot; &quot;late2cell&quot; &quot;4cell&quot; ## [6] &quot;16cell&quot; &quot;midblast&quot; &quot;earlyblast&quot; ## ## $Lineage2 ## [1] &quot;zy&quot; &quot;early2cell&quot; &quot;mid2cell&quot; &quot;late2cell&quot; &quot;4cell&quot; ## [6] &quot;16cell&quot; &quot;midblast&quot; &quot;lateblast&quot; ## ## $Lineage3 ## [1] &quot;zy&quot; &quot;early2cell&quot; &quot;mid2cell&quot; &quot;late2cell&quot; &quot;4cell&quot; ## [6] &quot;16cell&quot; &quot;8cell&quot; ## plot the lineages overlaid on the original PCA plot plot(reducedDims(deng_SCE)$PCA, col = my_color[as.character(deng_SCE$cell_type2)], pch=16, asp = 1) legend(&quot;bottomleft&quot;,legend = names(my_color[levels(deng_SCE$cell_type2)]), fill = my_color[levels(deng_SCE$cell_type2)]) lines(SlingshotDataSet(deng_SCE), lwd=2, type = &#39;lineages&#39;, col = c(&quot;black&quot;)) ## Plotting the pseudotime inferred by slingshot by cell types slingshot_df &lt;- data.frame(colData(deng_SCE)) ggplot(slingshot_df, aes(x = slingPseudotime_1, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + theme_classic() + xlab(&quot;First Slingshot pseudotime&quot;) + ylab(&quot;cell type&quot;) + ggtitle(&quot;Cells ordered by Slingshot pseudotime&quot;)+scale_colour_manual(values = my_color) ggplot(slingshot_df, aes(x = slingPseudotime_2, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + theme_classic() + xlab(&quot;Second Slingshot pseudotime&quot;) + ylab(&quot;cell type&quot;) + ggtitle(&quot;Cells ordered by Slingshot pseudotime&quot;)+scale_colour_manual(values = my_color) ggplot(slingshot_df, aes(x = slingPseudotime_1, y = slingPseudotime_2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + theme_classic() + xlab(&quot;First Slingshot pseudotime&quot;) + ylab(&quot;Second Slingshot pseudotime&quot;) + ggtitle(&quot;Cells ordered by Slingshot pseudotime&quot;)+scale_colour_manual(values = my_color) # # ggplot(slingshot_df, aes(x = slingPseudotime_1, y = slingPseudotime_2, # colour = slingPseudotime_3)) + # geom_point() + theme_classic() + # xlab(&quot;First Slingshot pseudotime&quot;) + ylab(&quot;Second Slingshot pseudotime&quot;) + # ggtitle(&quot;Cells ordered by Slingshot pseudotime&quot;)+facet_wrap(.~cell_type2) Note You can also supply a start and an end cluster to slingshot. Comments Did you notice the ordering of clusters in the lineage predicted for the 16cell state? There is an outlier-like cell in the 16cell group; find the outlier, remove it, and then re-run Slingshot. 11.3.1 GAM: generalized additive models for identifying temporally expressed genes After running slingshot, an interesting next step may be to find genes that change their expression over the course of development. We demonstrate one possible method for this type of analysis on the 100 most variable genes. We will regress each gene on the pseudotime variable we have generated, using a generalized additive model (GAM). This allows us to detect non-linear patterns in gene expression. 
library(gam) t &lt;- deng_SCE$slingPseudotime_1 # for time, only look at the 100 most variable genes Y &lt;- log1p(assay(deng_SCE,&quot;logcounts&quot;)) var100 &lt;- names(sort(apply(Y,1,var),decreasing = TRUE))[1:100] Y &lt;- Y[var100,] # fit a GAM with a loess term for pseudotime gam.pval &lt;- apply(Y,1,function(z){ d &lt;- data.frame(z=z, t=t) suppressWarnings({ tmp &lt;- gam(z ~ lo(t), data=d) }) p &lt;- summary(tmp)[3][[1]][2,3] p }) ## Plot the top 100 genes&#39; expression topgenes &lt;- names(sort(gam.pval, decreasing = FALSE))[1:100] heatdata &lt;- assays(deng_SCE)$logcounts[topgenes, order(t, na.last = NA)] heatclus &lt;- deng_SCE$cell_type2[order(t, na.last = NA)] heatmap(heatdata, Colv = NA, ColSideColors = my_color[heatclus],cexRow = 1,cexCol = 1) 11.4 Monocle The original Monocle (Trapnell et al. 2014) method skips the clustering stage of TSCAN and directly builds a minimum spanning tree on a reduced-dimension representation (using ‘ICA’) of the cells to connect all cells. Monocle then identifies the longest path in this tree as the main branch and uses this to determine pseudotime. Priors are required, such as the start/end state and the number of branching events. If the data contain diverging trajectories (i.e. one cell type differentiates into two different cell-types), Monocle can identify these. Each of the resulting forked paths is defined as a separate cell state. 11.4.1 Monocle 2 Monocle 2 (Qiu et al. 2017) uses a different approach, with dimensionality reduction and ordering performed by reverse graph embedding (RGE), allowing it to detect branching events in an unsupervised manner. RGE, a machine-learning strategy, learns a ‘principal graph’ to describe the single-cell dataset. RGE simultaneously also learns the mapping function of data points on the trajectory back to the original high-dimensional space. In doing so, it aims to position the latent points in the lower-dimensional space (along the trajectory) while also ensuring their corresponding positions in the input dimension are ‘neighbors’. There are different ways of implementing the RGE framework; Monocle 2 uses DDRTree (discriminative dimensionality reduction via learning a tree) by default. DDRTree learns latent points and the projection of latent points to the points in the original input space, which is equivalent to “dimension reduction”. In addition, it simultaneously learns a ‘principal graph’ over the K-means soft-clustered centroids of the latent points. The principal graph is the spanning tree of those centroids. DDRTree returns a principal tree of the centroids of cell clusters in low dimension, and pseudotime is derived for individual cells by calculating the geodesic distance of their projections onto the tree from the root (user-defined or arbitrarily assigned). Note Informally, a principal graph is like a principal curve which passes through the ‘middle’ of a data set but is allowed to have branches. 
library(monocle) #d &lt;- deng_SCE[m3dGenes,] ## feature selection deng &lt;- counts(deng_SCE) m3dGenes &lt;- as.character( M3DropFeatureSelection(deng)$Gene ) d &lt;- deng_SCE[which(rownames(deng_SCE) %in% m3dGenes), ] d &lt;- d[!duplicated(rownames(d)), ] colnames(d) &lt;- 1:ncol(d) geneNames &lt;- rownames(d) rownames(d) &lt;- 1:nrow(d) pd &lt;- data.frame(timepoint = cellLabels) pd &lt;- new(&quot;AnnotatedDataFrame&quot;, data=pd) fd &lt;- data.frame(gene_short_name = geneNames) fd &lt;- new(&quot;AnnotatedDataFrame&quot;, data=fd) dCellData &lt;- newCellDataSet(counts(d), phenoData = pd, featureData = fd) # dCellData &lt;- setOrderingFilter(dCellData, which(geneNames %in% m3dGenes)) dCellData &lt;- estimateSizeFactors(dCellData) dCellDataSet &lt;- reduceDimension(dCellData,reduction_method = &quot;DDRTree&quot;, pseudo_expr = 1) dCellDataSet &lt;- orderCells(dCellDataSet, reverse = FALSE) plot_cell_trajectory(dCellDataSet) # Store the ordering pseudotime_monocle2 &lt;- data.frame( Timepoint = phenoData(dCellDataSet)$timepoint, pseudotime = phenoData(dCellDataSet)$Pseudotime, State = phenoData(dCellDataSet)$State ) rownames(pseudotime_monocle2) &lt;- 1:ncol(d) pseudotime_order_monocle &lt;- rownames(pseudotime_monocle2[order(pseudotime_monocle2$pseudotime), ]) Note check the other available methods with ?reduceDimension We can again compare the inferred pseudotime to the known sampling timepoints. deng_SCE$pseudotime_monocle2 &lt;- pseudotime_monocle2$pseudotime ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_monocle2, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab(&quot;monocle2 pseudotime&quot;) + ylab(&quot;Timepoint&quot;) + ggtitle(&quot;Cells ordered by monocle2 pseudotime&quot;) Monocle 2 performs pretty well on these cells. 11.4.2 Monocle 3 Monocle3 (Cao et al. 2019) is the updated single-cell analysis toolkit for analysing large datasets. Monocle 3 is designed for use with absolute transcript counts (e.g. from UMI experiments). It first does dimension reduction with UMAP, then clusters the cells with the Louvain/Leiden algorithms and merges adjacent groups into supergroups, and finally resolves the trajectories individual cells can take during development, identifying the locations of branches and convergences within each supergroup. In short, Monocle3 uses UMAP to construct an initial trajectory inference and refines it by learning a principal graph. It builds a KNN graph in the UMAP dimensions and runs the Louvain/Leiden algorithms on the KNN graph to derive communities; edges are drawn to connect communities that have more links (the Partitioned Approximate Graph Abstraction (PAGA) graph). Each component of the PAGA graph is passed to the next step, which is learning a principal graph based on the SimplePPT algorithm. The pseudotime is calculated for individual cells by projecting the cells to their nearest point on the principal graph edge and measuring the geodesic distance along the principal points to the closest of their root nodes. library(monocle3) ## ## Attaching package: &#39;monocle3&#39; ## The following objects are masked from &#39;package:monocle&#39;: ## ## plot_genes_in_pseudotime, plot_genes_violin, ## plot_pc_variance_explained ## The following objects are masked from &#39;package:Biobase&#39;: ## ## exprs, fData, fData&lt;-, pData, pData&lt;- gene_meta &lt;- rowData(deng_SCE) #gene_metadata must contain a column verbatim named &#39;gene_short_name&#39; for certain functions. 
gene_meta$gene_short_name &lt;- rownames(gene_meta) cds &lt;- new_cell_data_set(expression_data = counts(deng_SCE), cell_metadata = colData(deng_SCE), gene_metadata = gene_meta) ## Step 1: Normalize and pre-process the data cds &lt;- preprocess_cds(cds,num_dim = 5) plot_pc_variance_explained(cds) ## Step 3: Reduce the dimensions using UMAP cds &lt;- reduce_dimension(cds) ## No preprocess_method specified, using preprocess_method = &#39;PCA&#39; ## Step 4: Cluster the cells cds &lt;- cluster_cells(cds) ## change the clusters ## cds@clusters$UMAP$clusters &lt;- deng_SCE$cell_type2 ## Step 5: Learn a graph cds &lt;- learn_graph(cds,use_partition = TRUE) ## Step 6: Order cells cds &lt;- order_cells(cds, root_cells = c(&quot;zy&quot;,&quot;zy.1&quot;,&quot;zy.2&quot;,&quot;zy.3&quot;) ) plot_cells(cds, color_cells_by=&quot;cell_type2&quot;, graph_label_size = 4, cell_size = 2, group_label_size = 6)+ scale_color_manual(values = my_color) plot_cells(cds, graph_label_size = 6, cell_size = 1, color_cells_by=&quot;pseudotime&quot;, group_label_size = 6) ## Cells aren&#39;t colored in a way that allows them to be grouped. pdata_cds &lt;- pData(cds) pdata_cds$pseudotime_monocle3 &lt;- monocle3::pseudotime(cds) ggplot(as.data.frame(pdata_cds), aes(x = pseudotime_monocle3, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab(&quot;monocle3 pseudotime&quot;) + ylab(&quot;Timepoint&quot;) + ggtitle(&quot;Cells ordered by monocle3 pseudotime&quot;) deng_SCE$pseudotime_monocle3 &lt;- pdata_cds$pseudotime_monocle3 It did not work well for our small Smart-seq2 dataset. 11.4.3 Diffusion maps Diffusion maps were introduced by Ronald Coifman and Stephane Lafon (Coifman and Lafon 2006), and the underlying idea is to assume that the data are samples from a diffusion process. The method infers the low-dimensional manifold by estimating the eigenvalues and eigenvectors for the diffusion operator related to the data. Angerer et al. (Angerer et al. 2016) have applied the diffusion maps concept to the analysis of single-cell RNA-seq data to create an R package called destiny. We will take the rank order of cells in the first diffusion map component as the “diffusion map pseudotime” here. deng &lt;- logcounts(deng_SCE) colnames(deng) &lt;- cellLabels dm &lt;- DiffusionMap(t(deng)) tmp &lt;- data.frame(DC1 = eigenvectors(dm)[,1], DC2 = eigenvectors(dm)[,2], Timepoint = deng_SCE$cell_type2) ggplot(tmp, aes(x = DC1, y = DC2, colour = Timepoint)) + geom_point() + scale_color_manual(values = my_color) + xlab(&quot;Diffusion component 1&quot;) + ylab(&quot;Diffusion component 2&quot;) + theme_classic() deng_SCE$pseudotime_diffusionmap &lt;- rank(eigenvectors(dm)[,1]) ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_diffusionmap, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab(&quot;Diffusion map pseudotime (first diffusion map component)&quot;) + ylab(&quot;Timepoint&quot;) + ggtitle(&quot;Cells ordered by diffusion map pseudotime&quot;) Like the other methods, using the first diffusion map component from destiny as pseudotime does a good job at ordering the early time-points (if we take high values as “earlier” in development), but it is unable to distinguish the later ones. Exercise 2 Do you get a better resolution between the later time points by considering additional eigenvectors? 
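One way to start on this exercise (a sketch reusing the dm object computed above; how to combine several components into a single pseudotime is left open):
## inspect the rank order of cells along the second diffusion component
deng_SCE$pseudotime_DC2 &lt;- rank(eigenvectors(dm)[, 2])
ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_DC2, y = cell_type2, colour = cell_type2)) +
  geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) +
  theme_classic() + xlab(&quot;Rank on diffusion component 2&quot;) + ylab(&quot;Timepoint&quot;)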
Exercise 3 How does the ordering change if you only use the genes identified by M3Drop? 11.5 Other methods 11.5.1 SLICER The SLICER (Welch, Hartemink, and Prins 2016) method is an algorithm for constructing trajectories that describe gene expression changes during a sequential biological process, just as Monocle and TSCAN are. SLICER is designed to capture highly nonlinear gene expression changes, automatically select genes related to the process, and detect multiple branch and loop features in the trajectory (Welch, Hartemink, and Prins 2016). The SLICER R package is available from its GitHub repository and can be installed from there using the devtools package. We use the select_genes function in SLICER to automatically select the genes to use in building the cell trajectory. The function uses “neighbourhood variance” to identify genes that vary smoothly, rather than fluctuating randomly, across the set of cells. Following this, we determine which value of “k” (number of nearest neighbours) yields an embedding that most resembles a trajectory. Then we estimate the locally linear embedding of the cells. library(&quot;lle&quot;) slicer_genes &lt;- select_genes(t(deng)) k &lt;- select_k(t(deng[slicer_genes,]), kmin = 30, kmax=60) ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates ## finding neighbours ## calculating weights ## computing coordinates slicer_traj_lle &lt;- lle(t(deng[slicer_genes,]), m = 2, k)$Y ## finding neighbours ## calculating weights ## computing coordinates reducedDim(deng_SCE, &quot;LLE&quot;) &lt;- slicer_traj_lle plot_df &lt;- data.frame(slicer1 = reducedDim(deng_SCE, &quot;LLE&quot;)[,1], slicer2 = reducedDim(deng_SCE, &quot;LLE&quot;)[,2], cell_type2 = deng_SCE$cell_type2) ggplot(data = plot_df)+geom_point(mapping = aes(x = slicer1, y = slicer2, color = cell_type2))+ scale_color_manual(values = my_color)+ xlab(&quot;LLE component 1&quot;) + ylab(&quot;LLE component 2&quot;) + ggtitle(&quot;Locally linear embedding of cells from SLICER&quot;)+ theme_classic() With the locally linear embedding computed we can construct a k-nearest neighbour graph that is fully connected. This plot displays a (yellow) circle for each cell, with the cell ID number overlaid in blue. Here we show the graph computed using 10 nearest neighbours. Here, SLICER appears to detect one major trajectory with one branch. slicer_traj_graph &lt;- conn_knn_graph(slicer_traj_lle, 10) plot(slicer_traj_graph, main = &quot;Fully connected kNN graph from SLICER&quot;) From this graph we can identify “extreme” cells that are candidates for start/end cells in the trajectory. ends &lt;- find_extreme_cells(slicer_traj_graph, slicer_traj_lle) start &lt;- ends[1] Having defined a start cell we can order the cells in the estimated pseudotime. 
pseudotime_order_slicer &lt;- cell_order(slicer_traj_graph, start) branches &lt;- assign_branches(slicer_traj_graph, start) pseudotime_slicer &lt;- data.frame( Timepoint = cellLabels, pseudotime = NA, State = branches ) pseudotime_slicer$pseudotime[pseudotime_order_slicer] &lt;- 1:length(pseudotime_order_slicer) deng_SCE$pseudotime_slicer &lt;- pseudotime_slicer$pseudotime We can again compare the inferred pseudotime to the known sampling timepoints. SLICER does not provide a pseudotime value per se, just an ordering of cells. ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_slicer, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab(&quot;SLICER pseudotime (cell ordering)&quot;) + ylab(&quot;Timepoint&quot;) + theme_classic() Like the previous method, SLICER (Welch, Hartemink, and Prins 2016) here provides a good ordering for the early time points. It places “16cell” cells before “8cell” cells, but provides a better ordering for blast cells than many of the earlier methods. Exercise 4 How do the results change for different k? (e.g. k = 5) What about changing the number of nearest neighbours in the call to conn_knn_graph? Exercise 5 How does the ordering change if you use a different set of genes from those chosen by SLICER (e.g. the genes identified by M3Drop)? 11.5.2 Ouija Ouija (http://kieranrcampbell.github.io/ouija/) takes a different approach from the pseudotime estimation methods we have looked at so far. Earlier methods have all been “unsupervised”, which is to say that apart from perhaps selecting informative genes we do not supply the method with any prior information about how we expect certain genes or the trajectory as a whole to behave. Ouija, in contrast, is a probabilistic framework that allows for interpretable learning of single-cell pseudotimes using only small panels of marker genes. This method: infers pseudotimes from a small number of marker genes, letting you understand why the pseudotimes have been learned in terms of those genes; provides parameter estimates (with uncertainty) for interpretable gene regulation behaviour (such as the peak time or the upregulation time); has a Bayesian hypothesis test to find genes regulated before others along the trajectory; identifies metastable states, i.e. discrete cell types along the continuous trajectory. We will supply the following marker genes to Ouija (with timepoints where they are expected to be highly expressed): Early timepoints: Dazl, Rnf17, Sycp3, Nanog, Pou5f1, Fgf8, Egfr, Bmp5, Bmp15 Mid timepoints: Zscan4b, Foxa1, Prdm14, Sox21 Late timepoints: Creb3, Gpx4, Krt8, Elf5, Eomes, Cdx2, Tdgf1, Gdf3 With Ouija we can model genes as either exhibiting monotonic up or down regulation (known as switch-like behaviour), or transient behaviour where the gene briefly peaks. By default, Ouija assumes all genes exhibit switch-like behaviour (the authors assure us not to worry if we get it wrong: the noise model means incorrectly specifying a transient gene as switch-like has minimal effect). Here we can “cheat” a little and check that our selected marker genes do actually identify different timepoints of the differentiation process. 
ouija_markers_down &lt;- c(&quot;Dazl&quot;, &quot;Rnf17&quot;, &quot;Sycp3&quot;, &quot;Fgf8&quot;, &quot;Egfr&quot;, &quot;Bmp5&quot;, &quot;Bmp15&quot;, &quot;Pou5f1&quot;) ouija_markers_up &lt;- c(&quot;Creb3&quot;, &quot;Gpx4&quot;, &quot;Krt8&quot;, &quot;Elf5&quot;, &quot;Cdx2&quot;, &quot;Tdgf1&quot;, &quot;Gdf3&quot;, &quot;Eomes&quot;) ouija_markers_transient &lt;- c(&quot;Zscan4b&quot;, &quot;Foxa1&quot;, &quot;Prdm14&quot;, &quot;Sox21&quot;) ouija_markers &lt;- c(ouija_markers_down, ouija_markers_up, ouija_markers_transient) plotExpression(deng_SCE, ouija_markers, x = &quot;cell_type2&quot;, colour_by = &quot;cell_type2&quot;) + theme(axis.text.x = element_text(angle = 60, hjust = 1)) In order to fit the pseudotimes we simply call ouija, passing in the expected response types. Note that if no response types are provided then they are all assumed to be switch-like by default. The input to Ouija can be a cell-by-gene matrix of non-negative expression values, or an ExpressionSet object, or, happily, the logcounts values from a SingleCellExperiment object. We can apply prior information about whether genes are up- or down-regulated across the differentiation process, and also provide prior information about when the switch in expression or a peak in expression is likely to occur. We can fit the Ouija model using either: Hamiltonian Monte Carlo (HMC), full MCMC inference where gradient information of the log-posterior is used to “guide” the random walk through the parameter space, or Automatic Differentiation Variational Bayes (ADVI or simply VI), approximate inference where the KL divergence to an approximate distribution is minimised. In general, HMC will provide more accurate inference with approximately correct posterior variance for all parameters. However, VB is orders of magnitude quicker than HMC, and while it may underestimate posterior variance, the Ouija authors suggest that anecdotally it often performs as well as HMC for discovering posterior pseudotimes. To help the Ouija model, we provide it with prior information about the strength of switches for up- and down-regulated genes. By setting the switch strength to -10 for down-regulated genes and 10 for up-regulated genes, with a prior strength standard deviation of 0.5, we are telling the model that we are confident about the expected behaviour of these genes across the differentiation process. options(mc.cores = parallel::detectCores()) response_type &lt;- c(rep(&quot;switch&quot;, length(ouija_markers_down) + length(ouija_markers_up)), rep(&quot;transient&quot;, length(ouija_markers_transient))) switch_strengths &lt;- c(rep(-10, length(ouija_markers_down)), rep(10, length(ouija_markers_up))) switch_strength_sd &lt;- c(rep(0.5, length(ouija_markers_down)), rep(0.5, length(ouija_markers_up))) garbage &lt;- capture.output( oui_vb &lt;- ouija(deng_SCE[ouija_markers,], single_cell_experiment_assay = &quot;logcounts&quot;, response_type = response_type, switch_strengths = switch_strengths, switch_strength_sd = switch_strength_sd, inference_type = &quot;vb&quot;) ) print(oui_vb) ## A Ouija fit with 268 cells and 20 marker genes ## Inference type: Variational Bayes ## (Gene behaviour) Switch/transient: 16 / 4 We can plot the gene expression over pseudotime along with the maximum a posteriori (MAP) estimates of the mean function (the sigmoid or Gaussian transient function) using the plot_expression function. 
plot_expression(oui_vb) We can also visualise when in the trajectory gene regulation behaviour occurs, either in the form of the switch time or the peak time (for switch-like or transient genes), using the plot_switch_times and plot_peak_times functions: plot_switch_times(oui_vb) plot_peak_times(oui_vb) Identify metastable states using consistency matrices: cmo &lt;- consistency_matrix(oui_vb) plot_consistency(oui_vb) cell_classifications &lt;- cluster_consistency(cmo) map_pst &lt;- map_pseudotime(oui_vb) ouija_pseudotime &lt;- data.frame(map_pst, cell_classifications) ggplot(ouija_pseudotime, aes(x = map_pst, y = cell_classifications)) + geom_point() + xlab(&quot;MAP pseudotime&quot;) + ylab(&quot;Cell classification&quot;) deng_SCE$pseudotime_ouija &lt;- ouija_pseudotime$map_pst deng_SCE$ouija_cell_class &lt;- ouija_pseudotime$cell_classifications ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_ouija, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab(&quot;Ouija pseudotime&quot;) + ylab(&quot;Timepoint&quot;) + theme_classic() Ouija does quite well in the ordering of the cells here, although it can be sensitive to the choice of marker genes and the prior information supplied. How do the results change if you select different marker genes or change the priors? Ouija identifies four metastable states here, which we might annotate as “zygote/2cell”, “4/8/16 cell”, “blast1” and “blast2”. ggplot(as.data.frame(colData(deng_SCE)), aes(x = as.factor(ouija_cell_class), y = pseudotime_ouija, colour = cell_type2)) + geom_boxplot() + coord_flip() + scale_color_manual(values = my_color) + theme_classic() + xlab(&quot;Ouija cell classification&quot;) + ylab(&quot;Ouija pseudotime&quot;) + theme_classic() A common analysis is to work out the regulation orderings of genes. For example, is gene A upregulated before gene B? Does gene C peak before the downregulation of gene D? Ouija answers these questions in terms of a Bayesian hypothesis test of whether the difference in regulation timing (either switch time or peak time) is significantly different from 0. This is collated using the gene_regulation function. gene_regs &lt;- gene_regulation(oui_vb) head(gene_regs) ## # A tibble: 6 x 7 ## # Groups: label, gene_A [6] ## label gene_A gene_B mean_difference lower_95 upper_95 significant ## &lt;chr&gt; &lt;chr&gt; &lt;chr&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;lgl&gt; ## 1 Bmp15 - Cdx2 Bmp15 Cdx2 -0.0631 -0.109 -0.0133 TRUE ## 2 Bmp15 - Creb3 Bmp15 Creb3 0.269 0.201 0.321 TRUE ## 3 Bmp15 - Elf5 Bmp15 Elf5 -0.678 -0.718 -0.644 TRUE ## 4 Bmp15 - Eomes Bmp15 Eomes 0.0822 0.00272 0.156 TRUE ## 5 Bmp15 - Foxa1 Bmp15 Foxa1 -0.0211 -0.0508 0.0120 FALSE ## 6 Bmp15 - Gdf3 Bmp15 Gdf3 0.0644 0.0163 0.126 TRUE What conclusions can you draw from the gene regulation output from Ouija? If you have time, you might try the HMC inference method and see if that changes the Ouija results in any way. 11.6 Comparison of the methods How do the trajectories inferred by TSCAN, Monocle, Diffusion Map, SLICER and Ouija compare? TSCAN and Diffusion Map methods get the trajectory the “wrong way round”, so we’ll adjust that for these comparisons. 
df_pseudotime &lt;- as.data.frame( colData(deng_SCE)[, grep(&quot;pseudotime&quot;, colnames(colData(deng_SCE)))] ) colnames(df_pseudotime) &lt;- gsub(&quot;pseudotime_&quot;, &quot;&quot;, colnames(df_pseudotime)) df_pseudotime$PC1 &lt;- reducedDim(deng_SCE,&quot;PCA&quot;)[,1] df_pseudotime$order_tscan &lt;- -df_pseudotime$order_tscan #df_pseudotime$diffusionmap &lt;- df_pseudotime$diffusionmap df_pseudotime$slingshot1 &lt;- colData(deng_SCE)$slingPseudotime_1 corrplot.mixed(cor(df_pseudotime, use = &quot;na.or.complete&quot;), order = &quot;hclust&quot;, tl.col = &quot;black&quot;, main = &quot;Correlation matrix for pseudotime results&quot;, mar = c(0, 0, 3.1, 0)) We see here that Ouija, TSCAN and SLICER all give trajectories that are similar and strongly correlated with PC1. Diffusion Map is less strongly correlated with these methods, and Monocle gives very different results. 11.7 Expression of genes through time Each package also enables the visualization of expression through pseudotime. Following individual genes is very helpful for identifying genes that play an important role in the differentiation process. We illustrate the procedure using the Nanog gene. We have added the pseudotime values computed with all methods here to the colData slot of an SCE object. Having done that, the full plotting capabilities of the scater package can be used to investigate relationships between gene expression, cell populations and pseudotime. This is particularly useful for packages such as SLICER that do not provide plotting functions. Principal components deng_SCE$PC1 &lt;- reducedDim(deng_SCE,&quot;PCA&quot;)[,1] plotExpression(deng_SCE, &quot;Nanog&quot;, x = &quot;PC1&quot;, colour_by = &quot;cell_type2&quot;, show_violin = FALSE, show_smooth = TRUE) TSCAN plotExpression(deng_SCE, &quot;Nanog&quot;, x = &quot;pseudotime_order_tscan&quot;, colour_by = &quot;cell_type2&quot;, show_violin = FALSE, show_smooth = TRUE) Monocle plotExpression(deng_SCE, &quot;Nanog&quot;, x = &quot;pseudotime_monocle2&quot;, colour_by = &quot;cell_type2&quot;, show_violin = FALSE, show_smooth = TRUE) Diffusion Map plotExpression(deng_SCE, &quot;Nanog&quot;, x = &quot;pseudotime_diffusionmap&quot;, colour_by = &quot;cell_type2&quot;, show_violin = FALSE, show_smooth = TRUE) SLICER plotExpression(deng_SCE, &quot;Nanog&quot;, x = &quot;pseudotime_slicer&quot;, colour_by = &quot;cell_type2&quot;, show_violin = FALSE, show_smooth = TRUE) Ouija plotExpression(deng_SCE, &quot;Nanog&quot;, x = &quot;pseudotime_ouija&quot;, colour_by = &quot;cell_type2&quot;, show_violin = FALSE, show_smooth = TRUE) Q: How many of these methods outperform the naive approach of using the first principal component to represent pseudotime for these data? Exercise 7: Repeat the exercise using a subset of the genes, e.g. the set of highly variable genes that can be obtained using one of the methods discussed in the Feature Selection chapter. 
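One way to put a number on the question above (a sketch; it reuses the df_pseudotime data frame built in section 11.6, and relies on the cell_type2 factor levels being in developmental order):
## Spearman correlation of each pseudotime with the known developmental stage;
## absolute values are taken because some methods run the trajectory backwards
stage_rank &lt;- as.numeric(deng_SCE$cell_type2)
abs(apply(df_pseudotime, 2, function(pt) cor(pt, stage_rank, method = &quot;spearman&quot;, use = &quot;pairwise.complete.obs&quot;)))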
11.7.1 dynverse https://dynverse.org/users/2-quick_start/ library(dyno) library(tidyverse) # Reproduces the guidelines as created in the shiny app answers &lt;- dynguidelines::answer_questions( multiple_disconnected = FALSE, expect_topology = TRUE, expected_topology = &quot;linear&quot;, n_cells = 3000, n_features = 10000, memory = &quot;100GB&quot;, docker = FALSE ) guidelines &lt;- dynguidelines::guidelines(answers = answers) guidelines deng_dataset &lt;- wrap_expression( counts = counts(deng_SCE), expression = assay(deng_SCE,&quot;logcounts&quot;) ) model &lt;- infer_trajectory(deng_dataset, first(guidelines$methods_selected)) ## Loading required namespace: hdf5r model &lt;- model %&gt;% add_dimred(dyndimred::dimred_mds, expression_source = deng_dataset$expression) plot_dimred( model, expression_source = deng_dataset$expression, grouping = deng_SCE$cell_type2 ) 11.7.2 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] splines parallel stats4 stats graphics grDevices utils ## [8] datasets methods base ## ## other attached packages: ## [1] rstan_2.19.2 StanHeaders_2.19.0 ## [3] lle_1.1 snowfall_1.84-6.1 ## [5] snow_0.4-3 MASS_7.3-51.1 ## [7] scatterplot3d_0.3-41 monocle3_0.2.0 ## [9] gam_1.16.1 foreach_1.4.7 ## [11] ouija_0.99.0 Rcpp_1.0.2 ## [13] SLICER_0.2.0 slingshot_1.2.0 ## [15] princurve_2.1.4 Polychrome_1.2.3 ## [17] corrplot_0.84 ggbeeswarm_0.6.0 ## [19] ggthemes_4.2.0 scater_1.12.2 ## [21] destiny_2.14.0 monocle_2.12.0 ## [23] DDRTree_0.1.5 irlba_2.3.3 ## [25] VGAM_1.1-1 ggplot2_3.2.1 ## [27] Matrix_1.2-17 M3Drop_1.10.0 ## [29] numDeriv_2016.8-1.1 TSCAN_1.22.0 ## [31] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [33] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [35] matrixStats_0.55.0 Biobase_2.44.0 ## [37] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 ## [39] IRanges_2.18.3 S4Vectors_0.22.1 ## [41] BiocGenerics_0.30.0 ## ## loaded via a namespace (and not attached): ## [1] rgl_0.100.30 rsvd_1.0.2 ## [3] vcd_1.4-4 Hmisc_4.2-0 ## [5] zinbwave_1.6.0 corpcor_1.6.9 ## [7] ps_1.3.0 class_7.3-15 ## [9] lmtest_0.9-37 glmnet_2.0-18 ## [11] crayon_1.3.4 laeken_0.5.0 ## [13] nlme_3.1-139 backports_1.1.4 ## [15] qlcMatrix_0.9.7 rlang_0.4.0 ## [17] XVector_0.24.0 readxl_1.3.1 ## [19] callr_3.3.2 limma_3.40.6 ## [21] phylobase_0.8.6 smoother_1.1 ## [23] manipulateWidget_0.10.0 bit64_0.9-7 ## [25] loo_2.1.0 glue_1.3.1 ## [27] pheatmap_1.0.12 rngtools_1.4 ## [29] splancs_2.01-40 processx_3.4.1 ## [31] vipor_0.4.5 AnnotationDbi_1.46.1 ## [33] haven_2.1.1 tidyselect_0.2.5 ## [35] rio_0.5.16 XML_3.98-1.20 ## [37] tidyr_1.0.0 zoo_1.8-6 ## [39] xtable_1.8-4 magrittr_1.5 ## [41] evaluate_0.14 bibtex_0.4.2 ## [43] cli_1.1.0 zlibbioc_1.30.0 ## [45] rstudioapi_0.10 miniUI_0.1.1.1 ## [47] sp_1.3-1 rpart_4.1-15 ## [49] locfdr_1.1-8 RcppEigen_0.3.3.5.0 ## [51] shiny_1.3.2 BiocSingular_1.0.0 ## [53] xfun_0.9 leidenbase_0.1.0 ## [55] inline_0.3.15 pkgbuild_1.0.5 ## [57] cluster_2.1.0 caTools_1.17.1.2 ## [59] sgeostat_1.0-27 tibble_2.1.3 ## [61] ggrepel_0.8.1 
ape_5.3 ## [63] stabledist_0.7-1 zeallot_0.1.0 ## [65] withr_2.1.2 bitops_1.0-6 ## [67] slam_0.1-45 ranger_0.11.2 ## [69] plyr_1.8.4 cellranger_1.1.0 ## [71] pcaPP_1.9-73 sparsesvd_0.2 ## [73] coda_0.19-3 e1071_1.7-2 ## [75] RcppParallel_4.4.3 pillar_1.4.2 ## [77] gplots_3.0.1.1 reldist_1.6-6 ## [79] kernlab_0.9-27 TTR_0.23-5 ## [81] ellipsis_0.3.0 tripack_1.3-8 ## [83] DelayedMatrixStats_1.6.1 xts_0.11-2 ## [85] vctrs_0.2.0 NMF_0.21.0 ## [87] tools_3.6.0 foreign_0.8-70 ## [89] rncl_0.8.3 beeswarm_0.2.3 ## [91] munsell_0.5.0 proxy_0.4-23 ## [93] HSMMSingleCell_1.4.0 compiler_3.6.0 ## [95] abind_1.4-5 httpuv_1.5.2 ## [97] pkgmaker_0.27 GenomeInfoDbData_1.2.1 ## [99] gridExtra_2.3 edgeR_3.26.8 ## [101] lattice_0.20-38 deldir_0.1-23 ## [103] utf8_1.1.4 later_0.8.0 ## [105] dplyr_0.8.3 jsonlite_1.6 ## [107] scales_1.0.0 docopt_0.6.1 ## [109] carData_3.0-2 genefilter_1.66.0 ## [111] lazyeval_0.2.2 promises_1.0.1 ## [113] spatstat_1.61-0 car_3.0-3 ## [115] doParallel_1.0.15 latticeExtra_0.6-28 ## [117] R.utils_2.9.0 goftest_1.1-1 ## [119] spatstat.utils_1.13-0 checkmate_1.9.4 ## [121] cowplot_1.0.0 rmarkdown_1.15 ## [123] openxlsx_4.1.0.1 statmod_1.4.32 ## [125] webshot_0.5.1 Rtsne_0.15 ## [127] forcats_0.4.0 copula_0.999-19.1 ## [129] softImpute_1.4 uwot_0.1.4 ## [131] igraph_1.2.4.1 HDF5Array_1.12.2 ## [133] survival_2.43-3 yaml_2.2.0 ## [135] htmltools_0.3.6 memoise_1.1.0 ## [137] locfit_1.5-9.1 viridisLite_0.3.0 ## [139] digest_0.6.21 assertthat_0.2.1 ## [141] mime_0.7 densityClust_0.3 ## [143] registry_0.5-1 RSQLite_2.1.2 ## [145] data.table_1.12.2 blob_1.2.0 ## [147] R.oo_1.22.0 RNeXML_2.3.0 ## [149] labeling_0.3 fastICA_1.2-2 ## [151] Formula_1.2-3 Rhdf5lib_1.6.1 ## [153] RCurl_1.95-4.12 hms_0.5.1 ## [155] rhdf5_2.28.0 colorspace_1.4-1 ## [157] base64enc_0.1-3 nnet_7.3-12 ## [159] ADGofTest_0.3 mclust_5.4.5 ## [161] bookdown_0.13 RANN_2.6.1 ## [163] mvtnorm_1.0-11 fansi_0.4.0 ## [165] pspline_1.0-18 VIM_4.8.0 ## [167] R6_2.4.0 grid_3.6.0 ## [169] lifecycle_0.1.0 acepack_1.4.1 ## [171] zip_2.0.4 curl_4.2 ## [173] gdata_2.18.0 robustbase_0.93-5 ## [175] howmany_0.3-1 RcppAnnoy_0.0.13 ## [177] RColorBrewer_1.1-2 MCMCglmm_2.29 ## [179] iterators_1.0.12 alphahull_2.2 ## [181] stringr_1.4.0 htmlwidgets_1.3 ## [183] polyclip_1.10-0 purrr_0.3.2 ## [185] crosstalk_1.0.0 mgcv_1.8-28 ## [187] tensorA_0.36.1 htmlTable_1.13.2 ## [189] clusterExperiment_2.4.4 codetools_0.2-16 ## [191] FNN_1.1.3 gtools_3.8.1 ## [193] prettyunits_1.0.2 gridBase_0.4-7 ## [195] RSpectra_0.15-0 R.methodsS3_1.7.1 ## [197] gtable_0.3.0 DBI_1.0.0 ## [199] highr_0.8 tensor_1.5 ## [201] httr_1.4.1 KernSmooth_2.23-15 ## [203] stringi_1.4.3 progress_1.2.2 ## [205] reshape2_1.4.3 uuid_0.1-2 ## [207] cubature_2.0.3 annotate_1.62.0 ## [209] viridis_0.5.1 xml2_1.2.2 ## [211] combinat_0.0-8 bbmle_1.0.20 ## [213] boot_1.3-20 BiocNeighbors_1.2.0 ## [215] ade4_1.7-13 DEoptimR_1.0-8 ## [217] bit_1.1-14 spatstat.data_1.4-0 ## [219] pkgconfig_2.0.3 gsl_2.1-6 ## [221] knitr_1.25 References "],
+["latent-spaces.html", "9 Latent spaces 9.1 Dimensionality reduction 9.2 Matrix factorization and factor analysis 9.3 Autoencoders", " 9 Latent spaces In many cases we may like to think of cells sitting in a low-dimensional, “latent” space that captures relationships between cells more intuitively than the very high-dimensional gene expression space. library(scater) library(SingleCellExperiment) library(glmpca) library(ggplot2) library(Polychrome) library(slalom) 9.1 Dimensionality reduction Why? - Reduce Curse of Dimensionality problems - Increase storage and computational efficiency - Visualize data in 2D or 3D Difficulty: We need to decide how many dimensions to keep. 9.1.1 PCA: Principal component analysis 9.1.1.1 (traditional) PCA PCA is a linear feature extraction technique. It performs a linear mapping of the data to a lower-dimensional space in such a way that the variance of the data in the low-dimensional representation is maximized. It does so by calculating the eigenvectors from the covariance matrix. The eigenvectors that correspond to the largest eigenvalues (the principal components) are used to reconstruct a significant fraction of the variance of the original data. In simpler terms, PCA combines your input features in a specific way such that you can drop the least important features while still retaining the most valuable parts of all of the features. As an added benefit, the new features or components created by PCA are all uncorrelated with one another. 9.1.1.1.1 Basic ideas of PCA Idea1: Dropping dimensions = projection onto a lower-dimensional space     Which dimensions should we keep? Idea2: More variation = more information     But what if the data are not readily projected onto either the X- or Y-axis? 9.1.1.1.2 Steps of PCA Step1: Rotation We want a set of axes (called principal components) that satisfies: - The 1st axis points in the direction in which variation is maximized, and so on - The axes are orthogonal to each other It can be shown that the eigenvectors of the covariance matrix satisfy these conditions, and that the eigenvector corresponding to the largest eigenvalue accounts for the most variation. Step2: Projection (3 dimensions \\(\\rightarrow\\) 2 dimensions) 9.1.1.1.3 An example of PCA deng &lt;- readRDS(&quot;data/deng/deng-reads.rds&quot;) my_color1 &lt;- createPalette(6, c(&quot;#010101&quot;, &quot;#ff0000&quot;), M=1000) names(my_color1) &lt;- unique(as.character(deng$cell_type1)) my_color2 &lt;- createPalette(10, c(&quot;#010101&quot;, &quot;#ff0000&quot;), M=1000) names(my_color2) &lt;- unique(as.character(deng$cell_type2)) deng &lt;- runPCA(deng, ncomponents = 2) plotPCA(deng, colour_by = &quot;cell_type1&quot;) + scale_fill_manual(values = my_color1) plotPCA(deng, colour_by = &quot;cell_type2&quot;) + scale_fill_manual(values = my_color2) 9.1.1.1.4 Advantages and limits of PCA: Advantages: fast, easy to use and intuitive. Limits: It can lead to local inconsistency, i.e. far-away points can become nearest neighbours. It is a linear projection, like casting a shadow, meaning it can’t capture non-linear dependencies. For instance, PCA would not be able to “unroll” the following structure.  9.1.1.2 GLM-PCA (Collins, Dasgupta, and Schapire 2002) (Townes et al. 2019) GLM-PCA is a generalized version of traditional PCA. Traditional PCA implicitly imposes an assumption of Gaussian distributions. The purpose of GLM-PCA is to loosen this condition to accommodate other distributions of the exponential family. Why does PCA assume a Gaussian distribution? 
Let \\(x_1, \\dots, x_n \\in \\mathcal{R}^d\\) be the \\(d\\)-dimensional data observed. PCA is looking for their projections onto a subspace: \\(u_1, \\dots, u_n\\), such that \\(\\sum_{i = 1}^n \\Vert x_i - u_i\\Vert^2\\) is minimized. This objective function can be interpreted in two ways: Interpretation 1: the variance of the projections/principal components: \\(\\sum_{i} \\Vert u_i \\Vert ^2\\), if the data are centered at the origin (\\(\\sum_{i} x_i = 0\\)); Interpretation 2: Each point \\(x_i\\) is thought of as a random draw from a probability distribution centered at \\(u_i\\). If we take this probability as a unit Gaussian, that is \\(x_i \\sim N(u_i, 1)\\), then the likelihood is \\(\\prod_{i = 1}^n \\exp (- \\Vert x_i - u_i\\Vert^2)\\), and the negative log likelihood is exactly the objective function. This assumption is often inappropriate for non-Gaussian-distributed data, for example discrete data. Therefore, GLM-PCA generalizes the Gaussian likelihood into a likelihood of any exponential-family distribution, and applies appropriate link functions to the \\(u_i\\)’s in the same way as a GLM does to non-Gaussian responses. The following example compares GLM-PCA with Poisson marginals to traditional PCA (the latter is identical to the result from plotPCA). ## GLM-PCA Y &lt;- assay(deng, &quot;counts&quot;) Y &lt;- Y[rowSums(Y) &gt; 0, ] system.time(res1 &lt;- glmpca(Y, L=2, fam=&quot;poi&quot;, verbose=TRUE)) ## user system elapsed ## 84.147 22.275 106.448 pd1 &lt;- data.frame(res1$factors, dimreduce=&quot;glmpca-poisson&quot;, clust = factor(deng$cell_type2)) ## traditional PCA pd2 &lt;- data.frame(reducedDim(deng, &quot;PCA&quot;), dimreduce=&quot;runPCA&quot;, clust = factor(deng$cell_type2)) colnames(pd2) &lt;- colnames(pd1) ## plot pd &lt;- rbind(pd1, pd2) ggplot(pd, aes(x = dim1, y = dim2, colour = clust)) + geom_point(size=2) + facet_wrap(~dimreduce, scales=&quot;free&quot;, nrow=3) + scale_color_manual(values = my_color2) + theme_bw() Let us compare GLM-PCA and standard PCA (using normalized log-counts data) on the Tung data, before cells have been QC’d. Repeat these plots with the QC’d Tung data. 9.1.2 tSNE: t-Distributed Stochastic Neighbor Embedding t-SNE (Maaten and Hinton 2008) is an advanced version of the original SNE algorithm (Hinton and Roweis 2003). 9.1.2.1 Motivation The weakness of PCA is the motivation behind the SNE algorithm. PCA focuses on the global covariance structure, which can lead to local inconsistency. SNE aims to preserve local structure, i.e. the relationships among data points (similar points remain similar; distinct points remain distinct). Unlike PCA, SNE is not limited to linear projections, which makes it suited to all sorts of datasets, including the swiss-roll data we have seen above. t-SNE solves the crowding issue of the original SNE. 9.1.2.2 Original SNE SNE minimizes the divergence between two distributions: a distribution that measures pairwise similarities of the input objects and a distribution that measures pairwise similarities of the corresponding low-dimensional points in the embedding. Goal: preserve neighbourhoods. Soft neighbourhood: For each data point \\(x_i\\), the \\(i\\rightarrow j\\) probability is the probability that point \\(x_i\\) chooses \\(x_j\\) as its neighbour: \\(p_{j|i} \\propto \\exp(-\\Vert x_i - x_j \\Vert^2/2\\delta^2)\\). 
(This can be thought of as the probability of \\(x_j\\) under \\(N(x_i, \\delta)\\).) \\(\\Vert x_i - x_j \\Vert^2\\) is the Euclidean distance: the closer \\(x_i\\) and \\(x_j\\) are, the larger \\(p_{j|i}\\) is. \\(\\delta^2\\) denotes the variance; it sets the size of the neighbourhood.   Very low \\(\\Rightarrow\\) all the probability is in the nearest neighbour   Very high \\(\\Rightarrow\\) uniform weights We generally want \\(\\delta^2\\) to be small for points in densely populated areas and large for sparse areas, so that the number of neighbours of each data point is roughly the same. It is computed from a user-specified parameter (perplexity) which indicates the effective number of neighbours for a data point. Similarity matrix Collect the \\(p_{j|i}\\) for all data points into a matrix; this matrix preserves the key information about the local neighbourhood structure. How SNE works: Given high-dimensional data \\(X = \\{x_1, \\dots, x_n \\in \\mathcal{R}^d \\}\\), obtain the similarity matrix \\(P\\); Let \\(Y =\\{y_1, \\dots, y_n \\in \\mathcal{R}^2\\}\\) be 2-dimensional data, the coordinates for visualization. Obtain a similarity matrix of \\(Y\\), denoted \\(Q\\), in the same way as for \\(X\\), except that \\(\\delta^2\\) is fixed at 1/2. Look for \\(Y\\) such that \\(Q\\) is as similar to \\(P\\) as possible. The measure of how similar two distributions are: the Kullback-Leibler divergence. (The definition of this cost function and the optimization procedure are out of the scope of this course.) 9.1.2.3 t-SNE The motivation of t-SNE is to solve one of the main issues of SNE, the crowding problem. Crowding problem: In high dimensions we have more room, so points can have a lot of different neighbours that are far apart from each other. But in low dimensions we don’t have enough room to accommodate all neighbours. For example, in 2D a point can have a few neighbours at distance one that are all far from each other; what happens when we embed in 1D? Solution: Change the distribution of the low-dimensional data \\(Q\\) into a Student’s t distribution. Recall that SNE is trying to minimize the dissimilarity of \\(P\\) and \\(Q\\), and \\(P\\) has a Gaussian distribution. So for a pair of points (\\(x_i\\) and \\(x_j\\) in high dimension, \\(y_i\\) and \\(y_j\\) in low dimension) to reach the same probability, the distance between \\(y_i\\) and \\(y_j\\) would have to be much larger (i.e. the points are placed much farther apart). 9.1.2.4 Example of t-SNE:  muraro &lt;- readRDS(&quot;data/pancreas/muraro.rds&quot;) tmp &lt;- runTSNE(muraro, perplexity = 3) plotTSNE(tmp, colour_by = &quot;cell_type1&quot;) tmp &lt;- runTSNE(muraro, perplexity = 50) plotTSNE(tmp, colour_by = &quot;cell_type1&quot;) 9.1.2.5 Limits of t-SNE: Not a convex problem, i.e. the cost function has multiple local minima. Non-deterministic. Requires the specification of very important parameters, e.g. perplexity. Coordinates after embedding have no meaning; therefore it can only be used for visualization. (See here for more pitfalls of using t-SNE.) 9.1.3 Manifold methods 9.1.3.1 UMAP: Uniform Manifold Approximation and Projection (McInnes, Healy, and Melville 2018) 9.1.3.1.1 Advantages of UMAP over t-SNE: faster deterministic better at preserving clusters 9.1.3.1.2 High-level description Construct a topological representation of the high-dimensional data (in this case a weighted \\(k\\)-NN graph) Given low-dimensional data, construct a graph in a similar way Minimize the dissimilarity between the two graphs. 
(Look for the low-dimensional data whose graph is the closest to that of the high-dimensional data.) 9.1.3.1.3 Some details How is the weighted graph built? Obtain dissimilarities from the input distances: For each data point \\(x_i\\), find its \\(k\\) nearest neighbours: \\(x_{i_1}, \\dots, x_{i_k}\\). Let \\(d(x_i, x_{i_j})\\) be the input or original distance between \\(x_i\\) and \\(x_{i_j}\\), and \\(\\rho_i = \\min[d(x_i, x_{i_j}); 1 \\leq j \\leq k]\\) be the distance between \\(x_i\\) and its nearest neighbour. Then the dissimilarity between \\(x_i\\) and \\(x_{i_j}\\) is measured simply by subtracting \\(\\rho_i\\) from the original distance: \\(\\tilde{d}(x_i, x_{i_j}) = d(x_i, x_{i_j}) - \\rho_i\\). Transform dissimilarity to similarity: \\(s(x_i, x_{i_j}) = \\exp[-\\tilde{d}(x_i, x_{i_j})] - c_i\\), where \\(c_i\\) is a scale factor ensuring \\(\\sum_{j = 1}^k s(x_i, x_{i_j})\\) is a constant for all \\(i\\). Similarity itself can serve as edge weights, but this similarity is not symmetric, i.e. \\(s(x_i, x_{i_j}) \\neq s(x_{i_j}, x_i)\\). To be able to project this onto an undirected graph, we need to resolve the disagreement between \\(s(x_i, x_{i_j})\\) and \\(s(x_{i_j}, x_i)\\). Obtain weights: \\(w(x_i, x_{i_j}) = s(x_i, x_{i_j}) + s(x_{i_j}, x_i) - s(x_i, x_{i_j}) * s(x_{i_j}, x_i)\\) (Interpretation: \\(P(A \\cup B ) = P(A) + P(B) - P(A)P(B)\\) if \\(A\\) and \\(B\\) are independent) How is the dissimilarity between the graphs measured? Cross-entropy. 9.1.3.1.4 Example of UMAP muraro &lt;- runUMAP(muraro) plotUMAP(muraro, colour_by=&quot;cell_type1&quot;) 9.1.3.2 PHATE (Moon et al. 2017) 9.1.3.2.1 Sketch of the algorithm The simplest description of PHATE: Step1. Create a dissimilarity matrix of the original data Step2. Feed the dissimilarity matrix to nonmetric MDS (MDS, multidimensional scaling, is a classical dimensionality reduction approach that takes a distance matrix as input and aims at preserving pairwise distances in the low-dimensional space. When the input is a Euclidean distance matrix, MDS produces the same result as PCA. Nonmetric MDS generalizes the input to a dissimilarity matrix, rather than just distances.) Details of Step1 in PHATE Step1-1. Markov transition matrix - What is similar to SNE: Recall that in the original SNE algorithm, there is a similarity matrix with entries \\(p_{j|i}\\) interpreted as the probability that point \\(x_i\\) chooses \\(x_j\\) as its neighbour: \\(p_{j|i} \\propto \\exp(-\\Vert x_i - x_j \\Vert^2/2\\delta^2)\\). PHATE does the same, except that we can interpret it differently: i. We can think of \\(p_{j|i}\\) as a Gaussian kernel, where \\(\\epsilon \\triangleq 2\\delta^2\\) is the bandwidth: \\(p_{j|i} \\triangleq K_\\epsilon(x_i, x_j )\\). Similar to SNE, PHATE also defines \\(\\epsilon\\) as the \\(k\\)-NN distance of each data point, so that it is smaller in dense areas and larger in sparse areas. Here \\(k\\) is a user-specified tuning parameter, similar to perplexity in SNE. ii. We can think of the similarity matrix as a transition matrix, where \\(p_{j|i}\\) represents the probability of jumping from state \\(i\\) to state \\(j\\) in a single step. - What is different: i. PHATE generalizes \\(K_\\epsilon(x_i, x_j)\\) to \\(\\exp \\left(- \\Vert x_i - x_j \\Vert^\\alpha /\\epsilon(x_i)^\\alpha\\right)\\), where the original Gaussian kernel is the special case \\(\\alpha = 2\\). 
The motivation is that if the data are very sparse in some regions, then the bandwidth \\(\\epsilon\\) will be very large and the kernel will become flat and lose the local information. By letting \\(\\alpha &gt; 2\\), we prevent this from happening, although \\(\\alpha\\) needs to be provided by the user.   ii. Note that the kernels are now not symmetric, that is \\(K_\\epsilon(x_i, x_j) \\neq K_\\epsilon(x_j, x_i)\\). So we make them symmetric by taking the average of the two.   Step1-2. Smoothing - \\(P\\) is the transition matrix where \\(p_{i, j}\\) represents the probability of jumping from state \\(i\\) to state \\(j\\) in a single step. - Denote by \\(\\delta_x\\) a row vector of length \\(n\\) (the number of data points), where the entry corresponding to \\(x\\) is 1 and all other entries are zero. Then \\(p_x = \\delta_x P\\) is the probability distribution of the data points starting from \\(x\\) after one step, and \\(p_x^t = \\delta_x P^t\\) is the probability distribution of the data points after \\(t\\) steps. In general, the more steps we take, the more data points will have positive probabilities. One way to think about this is that the larger \\(t\\) is, the more global and the less local information is encoded. In the extreme case, if we take infinitely many steps, \\(p_x^\\infty\\) will be the same for all \\(x\\)’s, i.e. the probability distribution is the same regardless of where we start; in this case the local information is completely lost. An appropriately chosen \\(t\\) is crucial for the balance between local and global information in the embedding. (See the original paper for details on choosing \\(t\\).) Step1-3. Distance measurement Instead of directly measuring the Euclidean distance between data points, say \\(x_i\\) and \\(x_j\\), PHATE measures the distance between the probability distributions \\(p_{x_i}^t\\) and \\(p_{x_j}^t\\): \\(D^t(x_i, x_j) = \\Vert \\log(p_{x_i}^t) - \\log(p_{x_j}^t) \\Vert^2\\) 9.1.3.2.2 Example of PHATE library(phateR) deng_phate &lt;- phate(t(assay(deng, &quot;logcounts&quot;))) dt &lt;- data.frame(deng_phate$embedding, clust = deng$cell_type1) palette(rainbow(10)) ggplot(dt, aes(x=PHATE1, y=PHATE2, color=clust)) + geom_point() 9.2 Matrix factorization and factor analysis The key concept of factor analysis: The original, observed variables are correlated because they are all associated with some unobservable variables, the latent factors. It looks similar to PCA, but instead of dimensionality reduction, factor analysis focuses on studying the latent factors. The variance of an observed variable can be split into two parts: - Common variance: the part of the variance that is explained by the latent factors; - Unique variance: the part that is specific to only one variable, usually considered as an error component or residual. The factor loadings or weights indicate how much each latent factor affects the observed features. 9.2.1 Slalom: Interpretable latent spaces Highlights of Slalom: (Buettner et al. 2017) It incorporates prior information to help the model estimation; It learns whatever is not provided by prior knowledge during model training; It enforces sparsity in the weight matrix. 
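Before turning to Slalom's methodology, a minimal sketch of classical factor analysis may make "loadings" and "unique variance" concrete. It uses base R's factanal rather than Slalom, and the choices of 2 factors and the 50 most variable genes are arbitrary illustrative assumptions (on other data the fit may fail to converge or warrant more factors).
logc &lt;- assay(deng, &quot;logcounts&quot;)
hvg &lt;- names(sort(apply(logc, 1, var), decreasing = TRUE))[1:50]
fa &lt;- factanal(t(logc[hvg, ]), factors = 2) # cells as observations, genes as variables
fa$loadings[1:5, ]   # how strongly each gene loads on each latent factor
fa$uniquenesses[1:5] # the unique variance left unexplained by the factors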
9.2.1.1 Methodology Matrix expression of factor analysis: How prior knowledge affects the model: \\(I_{g, k}\\): (observed) Indicator of whether a gene \\(g\\) is annotated to a given pathway or factor \\(k\\); \\(z_{g, k}\\): (latent) Indicator of whether factor \\(k\\) has a regulatory effect on gene \\(g\\); \\(w_{g, k}\\): (estimated) weights. grey arrow: \\[ P(I_{g, k}\\vert z_{g, k}) = \\begin{cases} \\text{Bernoulli}(p_1), \\text{if } z_{g, k} = 1\\\\ \\text{Bernoulli}(p_2), \\text{if } z_{g, k} = 0\\\\ \\end{cases}\\] green arrow: \\[ P(w_{g, k}\\vert z_{g, k}) = \\begin{cases} N(w_{g, k}, 1/\\alpha), \\text{ if } z_{g, k} = 1\\\\ \\delta_0(w_{g, k}), \\text{ if } z_{g, k} = 0\\\\ \\end{cases}\\] We only look at the part of the likelihood that is relevant here: \\(\\prod_{g} \\prod_{k}P(I_{g, k}, w_{g, k}, z_{g, k})\\), where \\(P(I_{g, k}, w_{g, k}, z_{g, k}) = P(I_{g, k}, w_{g, k}| z_{g, k})P(z_{g,k}) = P( I_{g, k}| z_{g, k})P( w_{g, k}| z_{g, k})P(z_{g,k})\\). Since we do not know anything about \\(z_{g,k}\\), it is given a Bernoulli(1/2) prior. 9.2.1.2 Example First, get genesets in a GeneSetCollection object. gmtfile &lt;- system.file(&quot;extdata&quot;, &quot;reactome_subset.gmt&quot;, package = &quot;slalom&quot;) genesets &lt;- GSEABase::getGmt(gmtfile) Then we create an Rcpp_SlalomModel object containing the input data and genesets (and subsequent results) for the model. model_deng &lt;- newSlalomModel(deng, genesets, n_hidden = 5, min_genes = 10) ## 29 annotated factors retained; 1 annotated factors dropped. ## 1072 genes retained for analysis. Initialize the model: model_deng &lt;- initSlalom(model_deng, seed = 100) Fit/train the model: model_deng &lt;- trainSlalom(model_deng, nIterations = 1000, seed = 100, tolerance = 0.001) ## pre-training model for faster convergence ## iteration 0 ## Model not converged after 50 iterations. ## iteration 0 ## Model not converged after 50 iterations. ## iteration 0 ## Switched off factor 29 ## Switched off factor 20 ## Switched off factor 32 ## Switched off factor 28 ## Switched off factor 13 ## Switched off factor 27 ## Switched off factor 10 ## iteration 100 ## Switched off factor 22 ## iteration 200 ## iteration 300 ## iteration 400 ## iteration 500 ## iteration 600 ## iteration 700 ## Model converged after 701 iterations. View results: The plotRelevance function displays the most relevant terms (factors/pathways) ranked by relevance, showing the gene set size and the number of genes gained/lost as active in the pathway as learnt by the model. plotRelevance(model_deng) + theme_classic(base_size = 8) ## NULL The plotTerms function shows the relevance of all terms in the model, enabling the identification of the most important pathways in the context of all that were included in the model. plotTerms(model_deng) 9.3 Autoencoders (Kingma and Welling 2013) 9.3.1 Background and some notation Data: \\(X\\) Latent variables: \\(Z\\) Something that is not directly observable but is assumed to have an impact on the observed variables. Goal: We believe \\(X\\) can be generated from \\(Z\\) (with some transformation), and want to sample more data from \\(Z\\) that resemble \\(X\\). So we want to find the parameters \\(\\theta\\) such that the probability of generating \\(X\\) from the distribution of \\(Z\\), \\(P(X) = \\int P(X|z; \\theta) P(z) dz\\), is maximized. How do we define \\(Z\\)? The simplest idea: \\(Z \\sim N(0, 1)\\). 
It is not impossible, because “any distribution in d dimensions can be generated by taking a set of d variables that are normally distributed and mapping them through a sufficiently complicated function.” A better idea: For most values of \\(z\\), \\(P(X|z; \\theta)\\) will be close to zero, meaning they contribute almost nothing to the estimate of \\(P(X)\\). Thus, we want to sample only those values of \\(Z\\) that are likely to produce \\(X\\). Denote this distribution of \\(Z\\) as \\(Q(Z|X)\\) (it is inferred and therefore depends on \\(X\\)). Advantage: There will be far fewer likely values of \\(Z\\) under \\(Q\\) than under random sampling; therefore, it will be easier to compute \\(E_{Z \\sim Q} P(X|Z)\\). 9.3.2 Objective \\[ \\log P(X) - KL[Q(Z|X)\\Vert P(Z|X)] = E_{Z\\sim Q}[\\log P(X|Z)] - KL[Q(Z|X)\\Vert P(Z)]\\] We can get this equation by starting from the definition of the Kullback-Leibler divergence, combined with Bayes’ formula and a little algebra. (Details not shown here.) LHS: what we want to maximize: the likelihood that the generated samples resemble \\(X\\), minus an error term which measures how much information is lost when using \\(Q\\) to represent \\(P\\); the error term becomes small if \\(Q\\) is high-capacity. (A loose explanation of model capacity: roughly speaking, the capacity of a model describes how complex a relationship it can model. You could expect a model with higher capacity to be able to model more relationships between more variables than a model with lower capacity.) RHS: what we can maximize through stochastic gradient descent. References "],
+["clustering-and-cell-annotation.html", "10 Clustering and cell annotation 10.1 Clustering Methods 10.2 Clustering example 10.3 An alternative to clustering: Automatic cell annotation", " 10 Clustering and cell annotation 10.1 Clustering Methods Once we have normalized the data and removed confounders we can carry out analyses that are relevant to the biological questions at hand. The exact nature of the analysis depends on the dataset. Nevertheless, there are a few aspects that are useful in a wide range of contexts and we will be discussing some of them in the next few chapters. We will start with the clustering of scRNA-seq data. 10.1.1 Introduction One of the most promising applications of scRNA-seq is de novo discovery and annotation of cell types based on transcription profiles. Computationally, this is a hard problem as it amounts to unsupervised clustering. That is, we need to identify groups of cells based on the similarities of the transcriptomes without any prior knowledge of the labels. Moreover, in most situations we do not even know the number of clusters a priori. The problem is made even more challenging due to the high level of noise (both technical and biological) and the large number of dimensions (i.e. genes). When working with large datasets, it can often be beneficial to apply some sort of dimensionality reduction method. By projecting the data onto a lower-dimensional sub-space, one is often able to significantly reduce the amount of noise. An additional benefit is that it is typically much easier to visualize the data in a 2- or 3-dimensional subspace. We have already discussed PCA (chapter 6.6.2) and t-SNE (chapter 6.6.2). Challenges in clustering What is the number of clusters k? What defines a good clustering? What is a cell type? Scalability: in the last few years the number of cells in scRNA-seq experiments has grown by several orders of magnitude, from ~\\(10^2\\) to ~\\(10^6\\) 10.1.2 Unsupervised clustering methods Three main ingredients of a complete clustering method: Measure of similarity: how do we quantify how close two data points are? Quality function: how do we decide how “good” a clustering/partition is? Algorithm: how do we find the clustering whose quality function is optimized? 10.1.2.1 Hierarchical clustering Hierarchical clustering is essentially the only type of clustering algorithm that does not seek to optimize a quality function, because it builds a hierarchy of clusters instead of a single partition as its output. There are two types of strategies: - Agglomerative (bottom-up): each observation starts in its own cluster, and pairs of clusters are merged as one moves up the hierarchy. - Divisive (top-down): all observations start in one cluster, and splits are performed recursively as one moves down the hierarchy.   10.1.2.2 k-means clustering Measure of similarity: Euclidean distance Quality function: within-cluster distance Algorithm: Advantage: Fast Drawbacks: - Sensitive to the initial clustering - Sensitive to outliers - Need to specify K - Tends to find clusters of similar sizes Tools related to K-means: SC3 10.1.2.3 Graph-based methods Real-world networks usually display large inhomogeneities, or community structure. Communities (also called clusters or modules) are groups of vertices which likely share common properties and/or play similar roles within the graph. In recent years there has been a lot of interest in detecting communities in networks in various domains. 
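Community detection is easiest to see on a small, classic network first. The following toy illustration (not scRNA-seq) runs Louvain clustering on igraph's built-in Zachary karate-club graph; it assumes only the igraph package, which is also loaded later in this chapter.
library(igraph)
karate &lt;- make_graph(&quot;Zachary&quot;)
community &lt;- cluster_louvain(karate)
membership(community) # community assignment of each vertex
modularity(community) # the quality function discussed below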
Some of these community detection methods can be applied to scRNA-seq data by building a graph where each vertex represents a cell and the (weight of an) edge measures the similarity between two cells. In fact, graph-based clustering is the most popular clustering approach in scRNA-seq data analysis, and it has been reported to outperform other clustering methods in many situations (Freytag et al. 2018). 10.1.2.3.1 Why do we want to represent the data as a graph? Memory effectiveness: A (complete) graph can be thought of as an alternative representation of the similarity matrix. Current methods (discussed later) aim to build sparse graphs, which eases the memory burden. Curse of dimensionality: All data become sparse in high-dimensional space, and therefore similarities measured by Euclidean distance etc. are generally low between all objects. 10.1.2.3.2 Building a graph Step1: Build an unweighted K-nearest neighbour (KNN) graph Step2: Add weights, and obtain a shared nearest neighbour (SNN) graph There are two ways of adding weights: number and rank. - number: The number of shared nodes between \\(u\\) and \\(v\\), in this case, 3. - rank: A measurement of the closeness to their common nearest neighbours. Details of rank: Main idea: The closeness of two people is defined by their closest common friend. For each node, say \\(u\\), we can rank its five neighbours according to their closeness to \\(u\\), and we can do the same with \\(v\\). Denote the three shared neighbours as \\(x_1\\), \\(x_2\\) and \\(x_3\\), so rank(\\(x_1, u\\)) = 1 means \\(x_1\\) is the closest neighbour of \\(u\\). The idea is, if \\(x_1\\) is also the closest to \\(v\\), then \\(u\\) and \\(v\\) should have a larger similarity, or weight. So we summarize the overall closeness of \\(x_1\\) with both \\(u\\) and \\(v\\) by taking the average: \\(\\dfrac{1}{2}(\\text{rank}(x_1, u) + \\text{rank}(x_1, v))\\). Then we find the shared neighbour with the largest closeness, i.e. the smallest average rank: \\(s(u, v) = \\min \\left[ \\dfrac{1}{2}(\\text{rank}(x_i, u) + \\text{rank}(x_i, v)) \\vert i = 1, 2, 3\\right]\\). The final expression of the weight: \\[ w(u, v) = K - s(u, v).\\] 10.1.2.3.3 Quality function (Modularity) Modularity (Newman and Girvan 2004) is not the only quality function for graph-based clustering, but it is one of the first attempts to embed in a compact form many questions, including the definition of the quality function and the null model. The idea of modularity: A random graph should not have a cluster structure. The more “quality” a partition has compared to a random graph, the “better” the partition is. Specifically, it is defined by: the quality of a partition on the actual graph \\(-\\) the quality of the same partition on a random graph quality : Sum of the weights within clusters random graph : a copy of the original graph, with some of its properties, but without community structure. The random graph defined by modularity is one in which each node has the same degree as in the original graph. \\[ Q \\propto \\sum_{i, j} A_{i, j} \\delta(i, j) - \\sum_{i, j} \\dfrac{k_i k_j}{2m} \\delta(i, j)\\] \\(A_{i, j}\\): weight between nodes \\(i\\) and \\(j\\); \\(\\delta(i, j)\\): indicator of whether \\(i\\) and \\(j\\) are in the same cluster; \\(k_i\\): the degree of node \\(i\\) (the sum of the weights of all edges connected to \\(i\\)); \\(m\\): the total edge weight in the whole graph. Higher modularity implies a better partition. Limits of modularity: (Good, De Montjoye, and Clauset 2010) 1. Resolution limit. 
Short version: Modularity maximization forces small communities into larger ones. Longer version: For two clusters \\(A\\) and \\(B\\), if \\(k_A k_B &lt; 2m\\) then modularity increases by merging \\(A\\) and \\(B\\) into a single cluster, even if \\(A\\) and \\(B\\) are distinct clusters. 2. Bad, even random, partitions may have a high modularity. Networks may lack a clear modularity maximum. 10.1.2.3.4 Algorithms: Modularity-based clustering methods implemented for single-cell analysis are mostly greedy algorithms, which are very fast, although not the most accurate approaches.     Louvain: (Blondel et al. 2008)     Leiden: (Traag, Waltman, and Eck 2019) an improved Louvain, a hybrid of a greedy algorithm and a sampling technique 10.1.2.3.5 Advantages: - Fast - No need to specify \\(k\\) 10.1.2.3.6 Tools for graph-based clustering: Seurat: Louvain, Leiden, SLM igraph: fast greedy, Louvain, optimal, walktrap, spinglass, infomap 10.1.2.4 Consensus clustering (more robustness, less computational speed) 10.1.2.4.1 Motivation (two problems of \\(K\\)-means): Problem1: sensitive to the initial partition   Solution:   Run multiple iterations of \\(K\\)-means on different subsamples of the original dataset, with different initial partitions. Problem2: the selection of \\(K\\).   Solution:   Run \\(K\\)-means with a range of \\(K\\)’s. 10.1.2.4.2 Algorithm of consensus clustering (simplest version): for(k in the range of K){ for(each subsample of the data){ for(iteration in 1:1000){ kmeans(subsample, k) # each iteration means a different initial partition save partition } } return consensus clustering result of k } 10.1.2.4.3 Subsample obtained by dimensionality reduction: Steps of PCA: i) transformation of the similarity matrix; ii) ranking the eigenvectors according to their corresponding eigenvalues in decreasing order; iii) deciding how many (\\(d\\)) PCs or eigenvalues to reduce to. In SC3, step i) considers two types of transformation: traditional PCA and the associated graph Laplacian; for step iii), the user may specify a range of \\(d\\), or use the default range suggested by the authors based on their empirical results. 10.1.2.4.4 Consensus clustering (combining multiple clustering results): Step1: Represent each partition as a matrix: Say we partitioned four data points into 2 clusters. Step2: Consensus matrix: Average of all the partitions 10.1.2.4.5 Tools for consensus clustering: SC3 10.2 Clustering example library(pcaMethods) library(SC3) library(scater) library(SingleCellExperiment) library(pheatmap) library(mclust) library(igraph) library(scran) 10.2.1 Example 1. Graph-based clustering (deng dataset) To illustrate clustering of scRNA-seq data, we consider the Deng dataset of cells from developing mouse embryos (Deng et al. 2014). We have preprocessed the dataset and created a SingleCellExperiment object in advance. We have also annotated the cells with the cell types identified in the original publication (the cell_type2 column in the colData slot). deng &lt;- readRDS(&quot;data/deng/deng-reads.rds&quot;) First, we build an SNN graph with a package function from scran. The most important decision in building a graph is the choice of \\(K\\), for which there is no standard rule. In general, we can think of it as an indication of the desired cluster size. If \\(K\\) is too small, a genuine cluster might be split into parts, while if \\(K\\) is too large, clusters might not be thoroughly separated. 
deng5 &lt;- buildSNNGraph(deng, k = 5) deng15 &lt;- buildSNNGraph(deng, k = 15) deng25 &lt;- buildSNNGraph(deng, k = 25) par(mfrow=c(1,3)) plot(deng5, vertex.size = 4, vertex.label = NA) title(&quot;5-NN&quot; ,line = -33, cex.main = 3) plot(deng15, vertex.size = 4, vertex.label = NA) title(&quot;15-NN&quot; ,line = -33, cex.main = 3) plot(deng25, vertex.size = 4, vertex.label = NA) title(&quot;25-NN&quot; ,line = -33, cex.main = 3) Perform Louvain clustering: cl &lt;- igraph::cluster_louvain(deng15)$membership colData(deng)$cl &lt;- factor(cl) mclust::adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$cl) ## [1] 0.4197754 The clustering reaches reasonable agreement with the labels provided in the original paper; however, it tends to merge small clusters into larger ones. table(deng$cell_type2, cl) ## cl ## 1 2 3 ## 16cell 49 0 1 ## 4cell 0 14 0 ## 8cell 36 0 1 ## early2cell 0 8 0 ## earlyblast 0 0 43 ## late2cell 0 10 0 ## lateblast 0 0 30 ## mid2cell 0 12 0 ## midblast 0 0 60 ## zy 0 4 0 10.2.2 Example 2. Graph-based clustering (muraro dataset) muraro &lt;- readRDS(&quot;data/pancreas/muraro.rds&quot;) ## PCA var.fit &lt;- suppressWarnings(trendVar(muraro, parametric=TRUE, use.spikes=F)) muraro &lt;- suppressWarnings(denoisePCA(muraro, technical=var.fit$trend)) dim(reducedDim(muraro, &quot;PCA&quot;)) ## [1] 2126 5 ## Build graph and clustering gr &lt;- buildSNNGraph(muraro, use.dimred=&quot;PCA&quot;, k = 30) cl &lt;- igraph::cluster_louvain(gr)$membership colData(muraro)$cl &lt;- factor(cl) mclust::adjustedRandIndex(colData(muraro)$cell_type1, colData(muraro)$cl) ## [1] 0.4845618 table(muraro$cell_type1, cl) ## cl ## 1 2 3 4 5 6 7 8 9 ## acinar 0 0 0 0 0 0 218 0 1 ## alpha 202 306 274 5 15 9 1 0 0 ## beta 1 0 0 5 195 21 2 220 4 ## delta 0 0 0 0 18 174 0 1 0 ## ductal 0 0 0 215 0 1 7 3 19 ## endothelial 0 0 0 0 0 0 0 0 21 ## epsilon 0 0 0 0 0 3 0 0 0 ## gamma 1 0 1 0 0 97 2 0 0 ## mesenchymal 0 0 0 1 0 0 0 0 79 ## unclear 0 0 0 4 0 0 0 0 0 10.2.3 Example 3. SC3 Let’s run SC3 clustering on the Deng data. The advantage of SC3 is that it can directly ingest a SingleCellExperiment object. SC3 can estimate the number of clusters: deng &lt;- sc3_estimate_k(deng) ## Estimating k... metadata(deng)$sc3$k_estimation ## [1] 6 Next we run SC3 (we also ask it to calculate biological properties of the clusters): deng &lt;- sc3(deng, ks = 10, biology = TRUE, n_cores = 1) ## Setting SC3 parameters... ## Calculating distances between the cells... ## Performing transformations and calculating eigenvectors... ## Performing k-means clustering... ## Calculating consensus matrix... ## Calculating biology... The SC3 result consists of several different outputs (see (Kiselev et al. 2017) and the SC3 vignette for more details). Here we show some of them: Consensus matrix: sc3_plot_consensus(deng, k = 10, show_pdata = &quot;cell_type2&quot;) Silhouette plot: sc3_plot_silhouette(deng, k = 10) Heatmap of the expression matrix: sc3_plot_expression(deng, k = 10, show_pdata = &quot;cell_type2&quot;) Identified marker genes: sc3_plot_markers(deng, k = 10, show_pdata = &quot;cell_type2&quot;) PCA plot with highlighted SC3 clusters: plotPCA(deng, colour_by = &quot;sc3_10_clusters&quot;) Compare the results of SC3 clustering with the original publication cell type labels: adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$sc3_10_clusters) ## [1] 0.7804189 Note SC3 can also be run in an interactive Shiny session: sc3_interactive(deng) This command will open SC3 in a web browser. 
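Since sc3 accepts a vector of ks, one quick way to explore the choice of k is to compute the ARI against the published labels over a small range. This is only a sketch: the range 8:12 is an arbitrary illustrative choice, and re-running SC3 takes a few minutes.
deng &lt;- sc3(deng, ks = 8:12, n_cores = 1)
## ARI of each SC3 clustering against the published cell_type2 labels
sapply(8:12, function(k)
  adjustedRandIndex(colData(deng)$cell_type2,
                    colData(deng)[[paste0(&quot;sc3_&quot;, k, &quot;_clusters&quot;)]]))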
Note Due to the direct calculation of distances, SC3 becomes very slow when the number of cells is \\(&gt;5000\\). For large datasets containing up to \\(10^5\\) cells we recommend using Seurat (see chapter 16). 10.3 An alternative to clustering: Automatic cell annotation 10.3.1 SingleR 10.3.1.1 Methodology Step1. Find variable genes 1-1. For every gene, obtain the median expression grouped by label. 1-2. Select genes that make at least one label different. For example, if we are looking for the genes that make the label “green” different from the label “red”, we subtract the first column from the second and pick the top \\(N\\) highest positive values. All analysis onwards uses only the selected variable genes. Step2. Spearman’s correlation Spearman’s correlation \\(\\in [-1, 1]\\) is a measure of the strength of a monotonic relationship between paired data. We compute Spearman’s correlation for all pairs of cells in the test and reference datasets, and obtain an \\(n_{\\text{test}} \\times n_{\\text{ref}}\\) correlation matrix, where \\(n\\) is the number of cells (see the first matrix in Step3). Step3. Scoring We want to know how each cell in the test data is correlated to the labels in the reference data, instead of to each reference cell. So we take the correlations of a cell in the test data with all the cells carrying a certain label in the reference data, and summarize them into a single score; in SingleR, the default is to take the \\(80\\%\\) quantile. Step4. Fine-tuning We could stop here and assign each cell the label with the highest score; in fact, if we set the argument fine.tune = FALSE, that is exactly what the package function SingleR does. But there is one more question: what if the second-highest score is very close to the highest, say, 1, 1, 1, 9.5, 10? SingleR sets a threshold to define how close is “very close”; the default is 0.05. For (only) the cells that fall into this category, it goes back to Step2. 10.3.1.2 Example (Note: SingleR is not yet available in the released version of Bioconductor. It will be possible to run it as shown once the next Bioconductor release is made in late October.) library(scRNAseq) library(SingleR) segerstolpe &lt;- readRDS(&quot;data/pancreas/segerstolpe.rds&quot;) sceM &lt;- suppressMessages(MuraroPancreasData()) sceM &lt;- sceM[,!is.na(sceM$label)] sceM &lt;- logNormCounts(sceM) ## find common genes rownames(sceM) &lt;- gsub(&quot;__.*&quot;,&quot;&quot;,rownames(sceM)) common &lt;- intersect(rownames(sceM), rownames(segerstolpe)) sceM &lt;- sceM[common,] segerstolpe &lt;- segerstolpe[common,] ## Prepare reference out &lt;- pairwiseTTests(logcounts(sceM), sceM$label, direction=&quot;up&quot;) markers &lt;- getTopMarkers(out$statistics, out$pairs, n=10) ## Annotation pred &lt;- SingleR(test=segerstolpe, ref=sceM, labels=sceM$label, genes=markers) ## View result plotScoreHeatmap(pred, show.labels = TRUE, annotation_col=data.frame( row.names=rownames(pred))) 10.3.2 scmap ## Load data segerstolpe &lt;- readRDS(&quot;data/pancreas/segerstolpe.rds&quot;) # test library(scRNAseq) sceM &lt;- readRDS(&quot;data/pancreas/muraro.rds&quot;) # reference rownames(sceM) &lt;- gsub(&quot;__.*&quot;,&quot;&quot;,rownames(sceM)) Select the most informative features (genes) using the dropout-based feature selection method; by default 500 features are selected. library(scmap) rowData(sceM)$feature_symbol &lt;- rownames(sceM) sceM &lt;- selectFeatures(sceM, suppress_plot = TRUE) Next, an index of the reference dataset is created. For scmap-cluster the index is the median gene expression of each cluster; for scmap-cell, used below, the index is built as follows. 
First, chop the total of 500 features into \\(M = 50\\) chunks (low-dimensional subspaces). Second, cluster each chunk into \\(k = \\sqrt{N}\\) clusters, where \\(N\\) is the number of cells. By default scmap uses the cell_type1 column of the colData slot in the reference to identify clusters. sceM &lt;- indexCell(sceM) The function indexCell writes the scmap_cell_index item of the metadata slot of the reference dataset sceM. This step has two outputs: names(metadata(sceM)$scmap_cell_index) ## [1] &quot;subcentroids&quot; &quot;subclusters&quot; subcentroids returns the cluster centers: cat(length(metadata(sceM)$scmap_cell_index$subcentroids), &quot; chunks \\n&quot;) ## 50 chunks cat(&quot;The dimension of cluster centers in each chunk: &quot;, dim(metadata(sceM)$scmap_cell_index$subcentroids[[1]]), &quot;\\n&quot;) ## The dimension of cluster centers in each chunk: 10 46 subclusters contains information about which cluster (label) each cell belongs to dim(metadata(sceM)$scmap_cell_index$subclusters) ## [1] 50 2126 metadata(sceM)$scmap_cell_index$subclusters[1:5,1:5] ## D28.1_1 D28.1_13 D28.1_15 D28.1_17 D28.1_2 ## [1,] 8 14 5 3 3 ## [2,] 15 40 8 35 4 ## [3,] 7 43 21 1 20 ## [4,] 27 42 34 27 30 ## [5,] 45 38 4 41 36 Projection: Once the scmap-cell indexes have been generated we can use them to project the test dataset. scmapCell_results &lt;- scmapCell( projection = segerstolpe, index_list = list( sceM = metadata(sceM)$scmap_cell_index ) ) names(scmapCell_results) ## [1] &quot;sceM&quot; The cells matrix contains the top 10 (scmap default) cell IDs of the cells of the reference dataset that a given cell of the projection dataset is closest to: dim(scmapCell_results$sceM$cells) ## [1] 10 3514 Cell annotation: If cell cluster annotation is available for the reference dataset, scmap-cell can also annotate the cells from the projection dataset using the labels of the reference. It does so by looking at the top 3 nearest neighbours (scmap default); if they all belong to the same cluster in the reference and their maximum similarity is higher than a threshold (0.5 is the scmap default), then the projection cell is assigned to the corresponding reference cluster: scmapCell_clusters &lt;- scmapCell2Cluster( scmapCell_results, list( colData(sceM)$cell_type1 )) Plot result: Compare the annotation result with the original labels of the segerstolpe dataset. plot( getSankey( segerstolpe$cell_type1, scmapCell_clusters$combined_labs, plot_height = 400 ) ) 10.3.3 sessionInfo() References "],
+["trajectory-inference.html", "11 Trajectory inference 11.1 First look at Deng data 11.2 TSCAN 11.3 Slingshot 11.4 Monocle 11.5 Other methods 11.6 Comparison of the methods 11.7 Expression of genes through time", " 11 Trajectory inference library(SingleCellExperiment) library(TSCAN) library(M3Drop) library(monocle) library(destiny) library(scater) library(ggplot2) library(ggthemes) library(ggbeeswarm) library(corrplot) library(Polychrome) library(slingshot) library(SLICER) library(ouija) set.seed(1) In many situations, one is studying a process where cells change continuously. This includes, for example, many differentiation processes taking place during development: following a stimulus, cells will change from one cell type to another. Ideally, we would like to monitor the expression levels of an individual cell over time. Unfortunately, such monitoring is not possible with scRNA-seq since the cell is lysed (destroyed) when the RNA is extracted. Instead, we must sample at multiple time-points and obtain snapshots of the gene expression profiles. Since some of the cells will proceed faster along the differentiation than others, each snapshot may contain cells at varying points along the developmental progression. We use statistical methods to order the cells along one or more trajectories which represent the underlying developmental trajectories; this ordering is referred to as “pseudotime”. In this chapter we will consider several tools: TSCAN, Slingshot, Monocle, SLICER and Ouija, as well as off-the-shelf methods such as PCA, for ordering cells according to their pseudotime development. To illustrate the methods we will be using a dataset on mouse embryonic development (Deng et al. 2014). The dataset consists of 268 cells from 10 different time-points of early mouse development. In this case, there is no need for pseudotime alignment since the cell labels provide information about the developmental trajectory. Thus, the labels allow us to establish a ground truth so that we can evaluate and compare the different methods. A recent benchmarking paper by Saelens et al. (Saelens et al. 2019) provides a detailed summary of the various computational methods for trajectory inference from single-cell transcriptomics. They discuss 45 tools and evaluate them across various aspects including accuracy, scalability, and usability. The following figures from the paper summarise several key aspects and some of the features of the tools being evaluated: Figure 2.3: Overview of several key aspects of the evaluation (Fig. 1 from Saelens et al, 2019). The characteristics of the 45 TI tools: Figure 2.4: Characterization of trajectory inference methods for single-cell transcriptomics data (Fig. 2 from Saelens et al, 2019). The detailed evaluation results of the 45 TI tools: Figure 2.5: Detailed results of the four main evaluation criteria: accuracy, scalability, stability and usability of trajectory inference methods for single-cell transcriptomics data (Fig. 3 from Saelens et al, 2019). 11.1 First look at Deng data Let us take a first look at the Deng (Deng et al. 2014) data, without yet applying sophisticated pseudotime methods. As the plot below shows, simple PCA does a very good job of displaying the structure in these data. It is only once we reach the blast cell types (“earlyblast”, “midblast”, “lateblast”) that PCA struggles to separate the distinct cell types. 
deng_SCE &lt;- readRDS(&quot;data/deng/deng-reads.rds&quot;) deng_SCE$cell_type2 &lt;- factor( deng_SCE$cell_type2, levels = c(&quot;zy&quot;, &quot;early2cell&quot;, &quot;mid2cell&quot;, &quot;late2cell&quot;, &quot;4cell&quot;, &quot;8cell&quot;, &quot;16cell&quot;, &quot;earlyblast&quot;, &quot;midblast&quot;, &quot;lateblast&quot;) ) cellLabels &lt;- deng_SCE$cell_type2 deng &lt;- counts(deng_SCE) colnames(deng) &lt;- cellLabels deng_SCE &lt;- scater::runPCA(deng_SCE, ncomponents = 5) ## change color palette with library(Polychrome) set.seed(723451) # for reproducibility my_color &lt;- createPalette(10, c(&quot;#010101&quot;, &quot;#ff0000&quot;), M=1000) names(my_color) &lt;- unique(as.character(deng_SCE$cell_type2)) pca_df &lt;- data.frame(PC1 = reducedDim(deng_SCE,&quot;PCA&quot;)[,1], PC2 = reducedDim(deng_SCE,&quot;PCA&quot;)[,2], cell_type2 = deng_SCE$cell_type2) ggplot(data = pca_df)+geom_point(mapping = aes(x = PC1, y = PC2, colour = cell_type2))+ scale_colour_manual(values = my_color)+theme_classic() PCA, here, provides a useful baseline for assessing different pseudotime methods. For a very naive pseudotime we can just take the coordinates of the first principal component. #deng_SCE$PC1 &lt;- reducedDim(deng_SCE, &quot;PCA&quot;)[,1] ggplot(pca_df, aes(x = PC1, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_colour_manual(values = my_color) + theme_classic() + xlab(&quot;First principal component&quot;) + ylab(&quot;Timepoint&quot;) + ggtitle(&quot;Cells ordered by first principal component&quot;) As the plot above shows, PC1 struggles to correctly order cells early and late in the developmental timecourse, but overall does a relatively good job of ordering cells by developmental time. Can bespoke pseudotime methods do better than a naive application of PCA? 11.2 TSCAN TSCAN (Ji and Ji 2019) combines clustering with pseudotime analysis. First it clusters the cells using mclust, which is based on a mixture of normal distributions. Then it builds a minimum spanning tree to connect the clusters. The branch of this tree that connects the largest number of clusters is the main branch which is used to determine pseudotime. Note From a connected graph with weighted edges, the MST is the tree structure that connects all the nodes in a way that has the minimum total edge weight. Trajectory inference methods that use an MST are based on the idea that nodes (cells/clusters of cells) and their connections represent the geometric shape of the data cloud in a two-dimensional space. First we will try to use all genes to order the cells. procdeng &lt;- TSCAN::preprocess(counts(deng_SCE)) colnames(procdeng) &lt;- 1:ncol(deng_SCE) dengclust &lt;- TSCAN::exprmclust(procdeng, clusternum = 10) TSCAN::plotmclust(dengclust) dengorderTSCAN &lt;- TSCAN::TSCANorder(dengclust, orderonly = FALSE) pseudotime_order_tscan &lt;- as.character(dengorderTSCAN$sample_name) deng_SCE$pseudotime_order_tscan &lt;- NA deng_SCE$pseudotime_order_tscan[as.numeric(dengorderTSCAN$sample_name)] &lt;- dengorderTSCAN$Pseudotime Frustratingly, TSCAN only provides pseudotime values for 221 of 268 cells, silently returning missing values for non-assigned cells. Again, we examine which timepoints have been assigned to each state: cellLabels[dengclust$clusterid == 10] ## [1] late2cell late2cell late2cell late2cell late2cell late2cell late2cell ## [8] late2cell late2cell late2cell ## 10 Levels: zy early2cell mid2cell late2cell 4cell 8cell ... 
lateblast ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_order_tscan, y = cell_type2, colour = cell_type2)) + geom_quasirandom(groupOnX = FALSE) + scale_color_manual(values = my_color) + theme_classic() + xlab(&quot;TSCAN pseudotime&quot;) + ylab(&quot;Timepoint&quot;) + ggtitle(&quot;Cells ordered by TSCAN pseudotime&quot;) TSCAN gets the development trajectory the “wrong way around”, in the sense that later pseudotime values correspond to early timepoints and vice versa. This is not inherently a problem (it is easy enough to reverse the ordering to get the intuitive interpretation of pseudotime), but overall it would be a stretch to suggest that TSCAN performs better than PCA on this dataset. (As it is a PCA-based method, perhaps this is not entirely surprising.) Exercise 1 Compare results for different numbers of clusters (clusternum). 11.3 Slingshot Slingshot (Street et al. 2018) is a single-cell lineage inference tool that can work with datasets with multiple branches. Slingshot has two stages: 1) the inference of the global lineage structure using an MST on clustered data points and 2) the inference of pseudotime variables for cells along each lineage by fitting simultaneous ‘principal curves’ across multiple lineages. Slingshot’s first stage uses a cluster-based MST to stably identify the key elements of the global lineage structure, i.e., the number of lineages and where they branch. This allows us to identify novel lineages while also accommodating the use of domain-specific knowledge to supervise parts of the tree (e.g., terminal cellular states). For the second stage, the authors propose a novel method called simultaneous principal curves, to fit smooth branching curves to these lineages, thereby translating the knowledge of global lineage structure into stable estimates of the underlying cell-level pseudotime variable for each lineage. Slingshot performed consistently well across different datasets as reported by Saelens et al., so let’s run it on the deng dataset. Slingshot recommends running on a reduced-dimensional representation of the data. Note Principal curves are smooth one-dimensional curves that pass through the middle of a p-dimensional data set, providing a nonlinear summary of the data. They are nonparametric, and their shape is suggested by the data (Hastie and Stuetzle 1989). ## running slingshot deng_SCE &lt;- slingshot(deng_SCE, clusterLabels = &#39;cell_type2&#39;,reducedDim = &quot;PCA&quot;, allow.breaks = FALSE) ## Using diagonal covariance matrix summary(deng_SCE$slingPseudotime_1) ## Min. 1st Qu. Median Mean 3rd Qu. Max. 
```r
## running slingshot
deng_SCE <- slingshot(deng_SCE, clusterLabels = 'cell_type2',
                      reducedDim = "PCA", allow.breaks = FALSE)
```

```
## Using diagonal covariance matrix
```

```r
summary(deng_SCE$slingPseudotime_1)
```

```
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   52.19   59.81   60.34   81.60   85.72      55
```

```r
## get lineages inferred by slingshot
lnes <- getLineages(reducedDim(deng_SCE, "PCA"), deng_SCE$cell_type2)
```

```
## Using diagonal covariance matrix
```

```r
lnes@lineages
```

```
## $Lineage1
## [1] "zy"         "early2cell" "mid2cell"   "late2cell"  "4cell"     
## [6] "16cell"     "midblast"   "earlyblast"
## 
## $Lineage2
## [1] "zy"         "early2cell" "mid2cell"   "late2cell"  "4cell"     
## [6] "16cell"     "midblast"   "lateblast" 
## 
## $Lineage3
## [1] "zy"         "early2cell" "mid2cell"   "late2cell"  "4cell"     
## [6] "16cell"     "8cell"
```

```r
## plot the lineages overlaid on the original PCA plot
plot(reducedDims(deng_SCE)$PCA,
     col = my_color[as.character(deng_SCE$cell_type2)],
     pch = 16, asp = 1)
legend("bottomleft",
       legend = names(my_color[levels(deng_SCE$cell_type2)]),
       fill = my_color[levels(deng_SCE$cell_type2)])
lines(SlingshotDataSet(deng_SCE), lwd = 2, type = 'lineages', col = "black")

## plot the pseudotime inferred by slingshot, by cell type
slingshot_df <- data.frame(colData(deng_SCE))
ggplot(slingshot_df, aes(x = slingPseudotime_1, y = cell_type2, colour = cell_type2)) +
    geom_quasirandom(groupOnX = FALSE) + theme_classic() +
    xlab("First Slingshot pseudotime") + ylab("cell type") +
    ggtitle("Cells ordered by Slingshot pseudotime") +
    scale_colour_manual(values = my_color)

ggplot(slingshot_df, aes(x = slingPseudotime_2, y = cell_type2, colour = cell_type2)) +
    geom_quasirandom(groupOnX = FALSE) + theme_classic() +
    xlab("Second Slingshot pseudotime") + ylab("cell type") +
    ggtitle("Cells ordered by Slingshot pseudotime") +
    scale_colour_manual(values = my_color)

ggplot(slingshot_df, aes(x = slingPseudotime_1, y = slingPseudotime_2, colour = cell_type2)) +
    geom_quasirandom(groupOnX = FALSE) + theme_classic() +
    xlab("First Slingshot pseudotime") + ylab("Second Slingshot pseudotime") +
    ggtitle("Cells ordered by Slingshot pseudotime") +
    scale_colour_manual(values = my_color)

# ggplot(slingshot_df, aes(x = slingPseudotime_1, y = slingPseudotime_2,
#                          colour = slingPseudotime_3)) +
#     geom_point() + theme_classic() +
#     xlab("First Slingshot pseudotime") + ylab("Second Slingshot pseudotime") +
#     ggtitle("Cells ordered by Slingshot pseudotime") + facet_wrap(. ~ cell_type2)
```

__Note__ You can also supply a start and an end cluster to `slingshot`.

__Comments__ Did you notice the ordering of clusters in the lineage predicted for the 16cell state? There is an outlier-like cell in the 16cell group; find the outlier, remove it, and then re-run Slingshot.

### 11.3.1 GAM: generalized additive models for identifying temporally expressed genes

After running Slingshot, an interesting next step is to find genes that change their expression over the course of development. We demonstrate one possible method for this type of analysis on the 100 most variable genes. We will regress each gene on the pseudotime variable we have generated, using a generalized additive model (GAM). This allows us to detect non-linear patterns in gene expression.
```r
library(gam)
t <- deng_SCE$slingPseudotime_1

# for time, only look at the 100 most variable genes
Y <- log1p(assay(deng_SCE, "logcounts"))
var100 <- names(sort(apply(Y, 1, var), decreasing = TRUE))[1:100]
Y <- Y[var100, ]

# fit a GAM with a loess term for pseudotime
gam.pval <- apply(Y, 1, function(z) {
    d <- data.frame(z = z, t = t)
    suppressWarnings({
        tmp <- gam(z ~ lo(t), data = d)
    })
    p <- summary(tmp)[3][[1]][2, 3]
    p
})

## plot the top 100 genes' expression
topgenes <- names(sort(gam.pval, decreasing = FALSE))[1:100]
heatdata <- assays(deng_SCE)$logcounts[topgenes, order(t, na.last = NA)]
heatclus <- deng_SCE$cell_type2[order(t, na.last = NA)]
heatmap(heatdata, Colv = NA, ColSideColors = my_color[heatclus],
        cexRow = 1, cexCol = 1)
```

## 11.4 Monocle

The original Monocle (Trapnell et al. 2014) method skips the clustering stage of TSCAN and directly builds a minimum spanning tree on a reduced-dimension representation (using ICA) of the cells. Monocle then identifies the longest path in this tree as the main branch and uses this to determine pseudotime. Priors are required, such as the start/end states and the number of branching events. If the data contain diverging trajectories (i.e. one cell type differentiates into two different cell types), Monocle can identify these; each of the resulting forked paths is defined as a separate cell state.

### 11.4.1 Monocle 2

Monocle 2 (Qiu et al. 2017) uses a different approach, with dimensionality reduction and ordering performed by reverse graph embedding (RGE), allowing it to detect branching events in an unsupervised manner. RGE, a machine-learning strategy, learns a 'principal graph' that describes the single-cell dataset. At the same time, RGE learns the mapping function from points on the trajectory back to the original high-dimensional space. In doing so, it aims to position the latent points in the lower-dimensional space (along the trajectory) while also ensuring that their corresponding positions in the input dimension are 'neighbors'.

There are different ways of implementing the RGE framework; Monocle 2 uses DDRTree (discriminative dimensionality reduction via learning a tree) by default. DDRTree learns the latent points and the projection of the latent points back to the points in the original input space, which is equivalent to "dimension reduction". In addition, it simultaneously learns a 'principal graph' over K-means soft-clustered centroids of the latent points; the principal graph is the spanning tree of those centroids. DDRTree thus returns a principal tree of the centroids of the cell clusters in low dimension, and pseudotime is derived for individual cells by calculating the geodesic distance from their projections onto the tree to the root (user-defined or arbitrarily assigned).

__Note__ Informally, a principal graph is like a principal curve that passes through the 'middle' of a dataset but is allowed to have branches.
```r
library(monocle)
#d <- deng_SCE[m3dGenes,]

## feature selection
deng <- counts(deng_SCE)
m3dGenes <- as.character(M3DropFeatureSelection(deng)$Gene)
d <- deng_SCE[which(rownames(deng_SCE) %in% m3dGenes), ]
d <- d[!duplicated(rownames(d)), ]

colnames(d) <- 1:ncol(d)
geneNames <- rownames(d)
rownames(d) <- 1:nrow(d)
pd <- data.frame(timepoint = cellLabels)
pd <- new("AnnotatedDataFrame", data = pd)
fd <- data.frame(gene_short_name = geneNames)
fd <- new("AnnotatedDataFrame", data = fd)

dCellData <- newCellDataSet(counts(d), phenoData = pd, featureData = fd)
# dCellData <- setOrderingFilter(dCellData, which(geneNames %in% m3dGenes))
dCellData <- estimateSizeFactors(dCellData)
dCellDataSet <- reduceDimension(dCellData, reduction_method = "DDRTree",
                                pseudo_expr = 1)
dCellDataSet <- orderCells(dCellDataSet, reverse = FALSE)
plot_cell_trajectory(dCellDataSet)

# store the ordering
pseudotime_monocle2 <- data.frame(
    Timepoint = phenoData(dCellDataSet)$timepoint,
    pseudotime = phenoData(dCellDataSet)$Pseudotime,
    State = phenoData(dCellDataSet)$State
)
rownames(pseudotime_monocle2) <- 1:ncol(d)
pseudotime_order_monocle <-
    rownames(pseudotime_monocle2[order(pseudotime_monocle2$pseudotime), ])
```

__Note__ Check the other dimensionality-reduction methods available via `?reduceDimension`.

We can again compare the inferred pseudotime to the known sampling timepoints.

```r
deng_SCE$pseudotime_monocle2 <- pseudotime_monocle2$pseudotime
ggplot(as.data.frame(colData(deng_SCE)),
       aes(x = pseudotime_monocle2, y = cell_type2, colour = cell_type2)) +
    geom_quasirandom(groupOnX = FALSE) +
    scale_color_manual(values = my_color) + theme_classic() +
    xlab("monocle2 pseudotime") + ylab("Timepoint") +
    ggtitle("Cells ordered by monocle2 pseudotime")
```

Monocle 2 performs pretty well on these cells.

### 11.4.2 Monocle 3

Monocle 3 (Cao et al. 2019) is the updated single-cell analysis toolkit for analysing large datasets, designed for use with absolute transcript counts (e.g. from UMI experiments). It first performs dimension reduction with UMAP, then clusters the cells with the Louvain/Leiden algorithms and merges adjacent groups into supergroups, and finally resolves the trajectories that individual cells can take during development within each supergroup.

In short, Monocle 3 uses UMAP to construct an initial trajectory inference and refines it by learning a principal graph. It builds a KNN graph in the UMAP space and runs the Louvain/Leiden algorithm on the KNN graph to derive communities; edges are drawn to connect communities that have more links, giving a Partitioned Approximate Graph Abstraction (PAGA) graph. Each component of the PAGA graph is then passed to the next step, which learns a principal graph based on the SimplePPT algorithm. Pseudotime is calculated for individual cells by projecting each cell onto its nearest point on a principal graph edge and measuring the geodesic distance along the principal points to the closest root node.

```r
library(monocle3)
```

```
## 
## Attaching package: 'monocle3'
## The following objects are masked from 'package:monocle':
## 
##     plot_genes_in_pseudotime, plot_genes_violin,
##     plot_pc_variance_explained
## The following objects are masked from 'package:Biobase':
## 
##     exprs, fData, fData<-, pData, pData<-
```
```r
gene_meta <- rowData(deng_SCE)
# gene_metadata must contain a column named exactly 'gene_short_name'
# for certain functions
gene_meta$gene_short_name <- rownames(gene_meta)
cds <- new_cell_data_set(expression_data = counts(deng_SCE),
                         cell_metadata = colData(deng_SCE),
                         gene_metadata = gene_meta)

## Step 1: Normalize and pre-process the data
cds <- preprocess_cds(cds, num_dim = 5)
plot_pc_variance_explained(cds)

## Step 3: Reduce the dimensions using UMAP
## (Step 2, removing batch effects, is optional and skipped here)
cds <- reduce_dimension(cds)
```

```
## No preprocess_method specified, using preprocess_method = 'PCA'
```

```r
## Step 4: Cluster the cells
cds <- cluster_cells(cds)
## optionally replace the clusters:
## cds@clusters$UMAP$clusters <- deng_SCE$cell_type2

## Step 5: Learn a graph
cds <- learn_graph(cds, use_partition = TRUE)

## Step 6: Order cells
cds <- order_cells(cds, root_cells = c("zy", "zy.1", "zy.2", "zy.3"))

plot_cells(cds, color_cells_by = "cell_type2",
           graph_label_size = 4, cell_size = 2, group_label_size = 6) +
    scale_color_manual(values = my_color)

plot_cells(cds, graph_label_size = 6, cell_size = 1,
           color_cells_by = "pseudotime", group_label_size = 6)
```

```
## Cells aren't colored in a way that allows them to be grouped.
```

```r
pdata_cds <- pData(cds)
pdata_cds$pseudotime_monocle3 <- monocle3::pseudotime(cds)

ggplot(as.data.frame(pdata_cds),
       aes(x = pseudotime_monocle3, y = cell_type2, colour = cell_type2)) +
    geom_quasirandom(groupOnX = FALSE) +
    scale_color_manual(values = my_color) + theme_classic() +
    xlab("monocle3 pseudotime") + ylab("Timepoint") +
    ggtitle("Cells ordered by monocle3 pseudotime")

deng_SCE$pseudotime_monocle3 <- pdata_cds$pseudotime_monocle3
```

Monocle 3 did not work well for our small Smart-seq2 dataset.

### 11.4.3 Diffusion maps

Diffusion maps were introduced by Ronald Coifman and Stephane Lafon (Coifman and Lafon 2006); the underlying idea is to assume that the data are samples from a diffusion process. The method infers the low-dimensional manifold by estimating the eigenvalues and eigenvectors of the diffusion operator associated with the data. Angerer et al. (Angerer et al. 2016) applied the diffusion map concept to the analysis of single-cell RNA-seq data in an R package called destiny.

Here we will take the rank order of cells in the first diffusion map component as the "diffusion map pseudotime".

```r
deng <- logcounts(deng_SCE)
colnames(deng) <- cellLabels
dm <- DiffusionMap(t(deng))

tmp <- data.frame(DC1 = eigenvectors(dm)[, 1],
                  DC2 = eigenvectors(dm)[, 2],
                  Timepoint = deng_SCE$cell_type2)
ggplot(tmp, aes(x = DC1, y = DC2, colour = Timepoint)) +
    geom_point() + scale_color_manual(values = my_color) +
    xlab("Diffusion component 1") + ylab("Diffusion component 2") +
    theme_classic()

deng_SCE$pseudotime_diffusionmap <- rank(eigenvectors(dm)[, 1])
ggplot(as.data.frame(colData(deng_SCE)),
       aes(x = pseudotime_diffusionmap, y = cell_type2, colour = cell_type2)) +
    geom_quasirandom(groupOnX = FALSE) +
    scale_color_manual(values = my_color) + theme_classic() +
    xlab("Diffusion map pseudotime (first diffusion map component)") +
    ylab("Timepoint") +
    ggtitle("Cells ordered by diffusion map pseudotime")
```

Like the other methods, using the first diffusion map component from destiny as pseudotime does a good job of ordering the early timepoints (if we take high values as "earlier" in development), but it is unable to distinguish the later ones.

__Exercise 2__ Do you get a better resolution between the later timepoints by considering additional eigenvectors?
__Exercise 3__ How does the ordering change if you only use the genes identified by M3Drop?

## 11.5 Other methods

### 11.5.1 SLICER

The SLICER (Welch, Hartemink, and Prins 2016) method is an algorithm for constructing trajectories that describe gene expression changes during a sequential biological process, just as Monocle and TSCAN are. SLICER is designed to capture highly nonlinear gene expression changes, automatically select genes related to the process, and detect multiple branch and loop features in the trajectory (Welch, Hartemink, and Prins 2016). The SLICER R package is available from its GitHub repository and can be installed from there using the devtools package.

We use the `select_genes` function in SLICER to automatically select the genes to use in building the cell trajectory. The function uses "neighbourhood variance" to identify genes that vary smoothly, rather than fluctuating randomly, across the set of cells. Following this, we determine which value of "k" (number of nearest neighbours) yields an embedding that most resembles a trajectory. Then we estimate the locally linear embedding (LLE) of the cells.

```r
library("lle")
slicer_genes <- select_genes(t(deng))
k <- select_k(t(deng[slicer_genes, ]), kmin = 30, kmax = 60)
```

```
## finding neighbours
## calculating weights
## computing coordinates
```

```r
slicer_traj_lle <- lle(t(deng[slicer_genes, ]), m = 2, k)$Y
```

```
## finding neighbours
## calculating weights
## computing coordinates
```

```r
reducedDim(deng_SCE, "LLE") <- slicer_traj_lle

plot_df <- data.frame(slicer1 = reducedDim(deng_SCE, "LLE")[, 1],
                      slicer2 = reducedDim(deng_SCE, "LLE")[, 2],
                      cell_type2 = deng_SCE$cell_type2)
ggplot(data = plot_df) +
    geom_point(mapping = aes(x = slicer1, y = slicer2, color = cell_type2)) +
    scale_color_manual(values = my_color) +
    xlab("LLE component 1") + ylab("LLE component 2") +
    ggtitle("Locally linear embedding of cells from SLICER") +
    theme_classic()
```

With the locally linear embedding computed, we can construct a k-nearest-neighbour graph that is fully connected. This plot displays a (yellow) circle for each cell, with the cell ID number overlaid in blue. Here we show the graph computed using 10 nearest neighbours. SLICER appears to detect one major trajectory with one branch.

```r
slicer_traj_graph <- conn_knn_graph(slicer_traj_lle, 10)
plot(slicer_traj_graph, main = "Fully connected kNN graph from SLICER")
```

From this graph we can identify "extreme" cells that are candidates for start/end cells in the trajectory.

```r
ends <- find_extreme_cells(slicer_traj_graph, slicer_traj_lle)
start <- ends[1]
```

Having defined a start cell, we can order the cells in the estimated pseudotime.
```r
pseudotime_order_slicer <- cell_order(slicer_traj_graph, start)
branches <- assign_branches(slicer_traj_graph, start)

pseudotime_slicer <- data.frame(
    Timepoint = cellLabels,
    pseudotime = NA,
    State = branches
)
pseudotime_slicer$pseudotime[pseudotime_order_slicer] <-
    1:length(pseudotime_order_slicer)
deng_SCE$pseudotime_slicer <- pseudotime_slicer$pseudotime
```

We can again compare the inferred pseudotime to the known sampling timepoints. SLICER does not provide a pseudotime value per se, just an ordering of cells.

```r
ggplot(as.data.frame(colData(deng_SCE)),
       aes(x = pseudotime_slicer, y = cell_type2, colour = cell_type2)) +
    geom_quasirandom(groupOnX = FALSE) +
    scale_color_manual(values = my_color) + theme_classic() +
    xlab("SLICER pseudotime (cell ordering)") + ylab("Timepoint")
```

Like the previous method, SLICER (Welch, Hartemink, and Prins 2016) provides a good ordering for the early timepoints here. It places "16cell" cells before "8cell" cells, but provides a better ordering for blast cells than many of the earlier methods.

__Exercise 4__ How do the results change for different k (e.g. k = 5)? What about changing the number of nearest neighbours in the call to `conn_knn_graph`?

__Exercise 5__ How does the ordering change if you use a different set of genes from those chosen by SLICER (e.g. the genes identified by M3Drop)?

### 11.5.2 Ouija

Ouija (http://kieranrcampbell.github.io/ouija/) takes a different approach from the pseudotime estimation methods we have looked at so far. The earlier methods are all "unsupervised": apart from perhaps selecting informative genes, we do not supply the method with any prior information about how we expect certain genes or the trajectory as a whole to behave.

Ouija, in contrast, is a probabilistic framework that allows for interpretable learning of single-cell pseudotimes using only small panels of marker genes. This method:

* infers pseudotimes from a small number of marker genes, letting you understand why the pseudotimes have been learned in terms of those genes;
* provides parameter estimates (with uncertainty) for interpretable gene regulation behaviour (such as the peak time or the upregulation time);
* has a Bayesian hypothesis test to find genes regulated before others along the trajectory;
* identifies metastable states, i.e. discrete cell types along the continuous trajectory.

We will supply the following marker genes to Ouija (with the timepoints where they are expected to be highly expressed):

* Early timepoints: Dazl, Rnf17, Sycp3, Nanog, Pou5f1, Fgf8, Egfr, Bmp5, Bmp15
* Mid timepoints: Zscan4b, Foxa1, Prdm14, Sox21
* Late timepoints: Creb3, Gpx4, Krt8, Elf5, Eomes, Cdx2, Tdgf1, Gdf3

With Ouija we can model genes as either exhibiting monotonic up- or down-regulation (known as switch-like behaviour) or transient behaviour, where the gene briefly peaks (a toy sketch of the two behaviour types is shown below). By default, Ouija assumes all genes exhibit switch-like behaviour (the authors assure us not to worry if we get this wrong: the noise model means that incorrectly specifying a transient gene as switch-like has minimal effect). Here we can "cheat" a little and check that our selected marker genes do actually identify different timepoints of the differentiation process.
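To visualise the difference between switch-like and transient behaviour, here is a small base-R sketch (purely illustrative, not Ouija's actual model code) plotting a sigmoid mean function against a transient, Gaussian-shaped one:

```r
## Illustrative mean functions over pseudotime in [0, 1]
pst <- seq(0, 1, length.out = 100)
switch_like <- 1 / (1 + exp(-10 * (pst - 0.5)))  # sigmoid: monotonic up-regulation
transient <- exp(-(pst - 0.5)^2 / (2 * 0.01))    # Gaussian bump: brief peak
plot(pst, switch_like, type = "l", lwd = 2,
     xlab = "Pseudotime", ylab = "Relative expression")
lines(pst, transient, lwd = 2, lty = 2)
legend("topleft", legend = c("switch-like", "transient"),
       lty = c(1, 2), lwd = 2)
```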
```r
ouija_markers_down <- c("Dazl", "Rnf17", "Sycp3", "Fgf8",
                        "Egfr", "Bmp5", "Bmp15", "Pou5f1")
ouija_markers_up <- c("Creb3", "Gpx4", "Krt8", "Elf5",
                      "Cdx2", "Tdgf1", "Gdf3", "Eomes")
ouija_markers_transient <- c("Zscan4b", "Foxa1", "Prdm14", "Sox21")
ouija_markers <- c(ouija_markers_down, ouija_markers_up, ouija_markers_transient)

plotExpression(deng_SCE, ouija_markers, x = "cell_type2",
               colour_by = "cell_type2") +
    theme(axis.text.x = element_text(angle = 60, hjust = 1))
```

In order to fit the pseudotimes we simply call `ouija`, passing in the expected response types. Note that if no response types are provided, then all genes are assumed to be switch-like by default. The input to Ouija can be a cell-by-gene matrix of non-negative expression values, an ExpressionSet object, or, conveniently, the logcounts values from a SingleCellExperiment object.

We can apply prior information about whether genes are up- or down-regulated across the differentiation process, and we can also provide prior information about when the switch in expression, or the peak of expression, is likely to occur.

We can fit the Ouija model using either:

* Hamiltonian Monte Carlo (HMC): full MCMC inference, where gradient information of the log-posterior is used to "guide" the random walk through the parameter space, or
* Automatic Differentiation Variational Bayes (ADVI, or simply VI): approximate inference, where the KL divergence to an approximate distribution is minimised.

In general, HMC will provide more accurate inference, with approximately correct posterior variance for all parameters. However, VI is orders of magnitude quicker than HMC, and while it may underestimate posterior variance, the Ouija authors suggest that anecdotally it often performs as well as HMC for discovering posterior pseudotimes.

To help the Ouija model, we provide it with prior information about the strength of switches for up- and down-regulated genes. By setting the switch strength to -10 for down-regulated genes and 10 for up-regulated genes, with a prior strength standard deviation of 0.5, we are telling the model that we are confident about the expected behaviour of these genes across the differentiation process.

```r
options(mc.cores = parallel::detectCores())
response_type <- c(rep("switch", length(ouija_markers_down) +
                           length(ouija_markers_up)),
                   rep("transient", length(ouija_markers_transient)))
switch_strengths <- c(rep(-10, length(ouija_markers_down)),
                      rep(10, length(ouija_markers_up)))
switch_strength_sd <- c(rep(0.5, length(ouija_markers_down)),
                        rep(0.5, length(ouija_markers_up)))
garbage <- capture.output(
    oui_vb <- ouija(deng_SCE[ouija_markers, ],
                    single_cell_experiment_assay = "logcounts",
                    response_type = response_type,
                    switch_strengths = switch_strengths,
                    switch_strength_sd = switch_strength_sd,
                    inference_type = "vb")
)

print(oui_vb)
```

```
## A Ouija fit with 268 cells and 20 marker genes 
## Inference type:  Variational Bayes 
## (Gene behaviour) Switch/transient: 16 / 4
```

We can plot the gene expression over pseudotime along with the maximum a posteriori (MAP) estimates of the mean function (the sigmoid or Gaussian transient function) using the `plot_expression` function.
```r
plot_expression(oui_vb)
```

We can also visualise when in the trajectory gene regulation behaviour occurs, either in the form of the switch time or the peak time (for switch-like or transient genes, respectively), using the `plot_switch_times` and `plot_peak_times` functions:

```r
plot_switch_times(oui_vb)
plot_peak_times(oui_vb)
```

We can identify metastable states using consistency matrices:

```r
cmo <- consistency_matrix(oui_vb)
plot_consistency(oui_vb)
cell_classifications <- cluster_consistency(cmo)

map_pst <- map_pseudotime(oui_vb)
ouija_pseudotime <- data.frame(map_pst, cell_classifications)

ggplot(ouija_pseudotime, aes(x = map_pst, y = cell_classifications)) +
    geom_point() +
    xlab("MAP pseudotime") +
    ylab("Cell classification")

deng_SCE$pseudotime_ouija <- ouija_pseudotime$map_pst
deng_SCE$ouija_cell_class <- ouija_pseudotime$cell_classifications

ggplot(as.data.frame(colData(deng_SCE)),
       aes(x = pseudotime_ouija, y = cell_type2, colour = cell_type2)) +
    geom_quasirandom(groupOnX = FALSE) +
    scale_color_manual(values = my_color) + theme_classic() +
    xlab("Ouija pseudotime") + ylab("Timepoint")
```

Ouija does quite well at ordering the cells here, although it can be sensitive to the choice of marker genes and the prior information supplied. How do the results change if you select different marker genes or change the priors?

Ouija identifies four metastable states here, which we might annotate as "zygote/2cell", "4/8/16 cell", "blast1" and "blast2".

```r
ggplot(as.data.frame(colData(deng_SCE)),
       aes(x = as.factor(ouija_cell_class), y = pseudotime_ouija,
           colour = cell_type2)) +
    geom_boxplot() + coord_flip() +
    scale_color_manual(values = my_color) + theme_classic() +
    xlab("Ouija cell classification") + ylab("Ouija pseudotime")
```

A common analysis is to work out the regulation orderings of genes. For example, is gene A up-regulated before gene B? Does gene C peak before the down-regulation of gene D? Ouija answers these questions in terms of a Bayesian hypothesis test of whether the difference in regulation timing (either the switch time or the peak time) is significantly different from 0. These results are collated using the `gene_regulation` function.

```r
gene_regs <- gene_regulation(oui_vb)
head(gene_regs)
```

```
## # A tibble: 6 x 7
## # Groups:   label, gene_A [6]
##   label         gene_A gene_B mean_difference lower_95 upper_95 significant
##   <chr>         <chr>  <chr>            <dbl>    <dbl>    <dbl> <lgl>      
## 1 Bmp15 - Cdx2  Bmp15  Cdx2           -0.0631 -0.109   -0.0133  TRUE       
## 2 Bmp15 - Creb3 Bmp15  Creb3           0.269   0.201    0.321   TRUE       
## 3 Bmp15 - Elf5  Bmp15  Elf5           -0.678  -0.718   -0.644   TRUE       
## 4 Bmp15 - Eomes Bmp15  Eomes           0.0822  0.00272  0.156   TRUE       
## 5 Bmp15 - Foxa1 Bmp15  Foxa1          -0.0211 -0.0508   0.0120  FALSE      
## 6 Bmp15 - Gdf3  Bmp15  Gdf3            0.0644  0.0163   0.126   TRUE
```

What conclusions can you draw from the gene regulation output from Ouija? If you have time, you might try the HMC inference method and see whether that changes the Ouija results in any way.

## 11.6 Comparison of the methods

How do the trajectories inferred by TSCAN, Monocle, Diffusion Map, SLICER and Ouija compare? TSCAN and Diffusion Map get the trajectory the "wrong way round", so we adjust for that in these comparisons.
```r
df_pseudotime <- as.data.frame(
    colData(deng_SCE)[, grep("pseudotime", colnames(colData(deng_SCE)))]
)
colnames(df_pseudotime) <- gsub("pseudotime_", "", colnames(df_pseudotime))
df_pseudotime$PC1 <- reducedDim(deng_SCE, "PCA")[, 1]
df_pseudotime$order_tscan <- -df_pseudotime$order_tscan
#df_pseudotime$diffusionmap <- df_pseudotime$diffusionmap
df_pseudotime$slingshot1 <- colData(deng_SCE)$slingPseudotime_1

corrplot.mixed(cor(df_pseudotime, use = "na.or.complete"),
               order = "hclust", tl.col = "black",
               main = "Correlation matrix for pseudotime results",
               mar = c(0, 0, 3.1, 0))
```

We see here that Ouija, TSCAN and SLICER all give trajectories that are similar and strongly correlated with PC1. Diffusion Map is less strongly correlated with these methods, and Monocle gives very different results.

## 11.7 Expression of genes through time

Each package also enables the visualisation of expression through pseudotime. Following individual genes is very helpful for identifying genes that play an important role in the differentiation process. We illustrate the procedure using the Nanog gene.

We have added the pseudotime values computed with all methods to the colData slot of the SCE object. Having done that, the full plotting capabilities of the scater package can be used to investigate relationships between gene expression, cell populations and pseudotime. This is particularly useful for packages such as SLICER that do not provide their own plotting functions.

__Principal components__

```r
deng_SCE$PC1 <- reducedDim(deng_SCE, "PCA")[, 1]
plotExpression(deng_SCE, "Nanog", x = "PC1",
               colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE)
```

__TSCAN__

```r
plotExpression(deng_SCE, "Nanog", x = "pseudotime_order_tscan",
               colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE)
```

__Monocle__

```r
plotExpression(deng_SCE, "Nanog", x = "pseudotime_monocle2",
               colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE)
```

__Diffusion Map__

```r
plotExpression(deng_SCE, "Nanog", x = "pseudotime_diffusionmap",
               colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE)
```

__SLICER__

```r
plotExpression(deng_SCE, "Nanog", x = "pseudotime_slicer",
               colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE)
```

__Ouija__

```r
plotExpression(deng_SCE, "Nanog", x = "pseudotime_ouija",
               colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE)
```

__Q:__ How many of these methods outperform the naive approach of using the first principal component to represent pseudotime for these data?

__Exercise 7__: Repeat the exercise using a subset of the genes, e.g. the set of highly variable genes that can be obtained using one of the methods discussed in the Feature Selection chapter.
### 11.7.1 dynverse

https://dynverse.org/users/2-quick_start/

```r
library(dyno)
library(tidyverse)

# Reproduces the guidelines as created in the dynguidelines shiny app
answers <- dynguidelines::answer_questions(
    multiple_disconnected = FALSE,
    expect_topology = TRUE,
    expected_topology = "linear",
    n_cells = 3000,
    n_features = 10000,
    memory = "100GB",
    docker = FALSE
)
guidelines <- dynguidelines::guidelines(answers = answers)
guidelines

deng_dataset <- wrap_expression(
    counts = counts(deng_SCE),
    expression = assay(deng_SCE, "logcounts")
)

model <- infer_trajectory(deng_dataset, first(guidelines$methods_selected))
```

```
## Loading required namespace: hdf5r
```

```r
model <- model %>%
    add_dimred(dyndimred::dimred_mds,
               expression_source = deng_dataset$expression)
plot_dimred(
    model,
    expression_source = deng_dataset$expression,
    grouping = deng_SCE$cell_type2
)
```

### 11.7.2 sessionInfo()

```
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
```
 ["dechapter.html", "12 Differential Expression (DE) analysis 12.1 Introduction to DE analysis 12.2 DE in a real dataset", " 12 Differential Expression (DE) analysis 12.1 Introduction to DE analysis 12.1.1 Bulk RNA-seq One of the most common types of analyses when working with bulk RNA-seq data is to identify differentially expressed genes. By comparing the genes that change between two or more conditions, e.g. mutant and wild-type or stimulated and unstimulated, it is possible to characterize the molecular mechanisms underlying the change. Several different methods, e.g. edgeR and DESeq2 and more, have been developed for bulk RNA-seq and become established as parts of robust and widely-used analysis workflows. Moreover, there are also extensive datasets available where the RNA-seq data has been validated using RT-qPCR. These data can be used to benchmark DE finding algorithms and the available evidence suggests that the algorithms are performing well. 12.1.2 Single cell RNA-seq In contrast to bulk RNA-seq, in scRNA-seq we often do not have a defined set of experimental conditions. Instead, as was shown in a previous chapter (10.2) we can identify the cell groups by using an unsupervised clustering approach. Once the groups have been identified one can find differentially expressed genes either by comparing the differences in variance between the groups (like the Kruskal-Wallis test implemented in SC3), or by comparing gene expression between clusters in a pairwise manner. In the following chapter we will mainly consider tools developed for pairwise comparisons. These method may also be applied when comparing cells obtained from different groups or conditions. Such analyses can be complicated by differing cell type proportions between samples (i.e. distinct samples cell populations; the unit of replication in the study). In such cases, it is likely beneficial to identify distinct cell types and conduct differential expression testing between conditions within each cell type. 12.1.3 Differences in Distribution Unlike bulk RNA-seq, we generally have a large number of samples (i.e. cells) for each group we are comparing in single-cell experiments. Thus we may be able to take advantage of the whole distribution of expression values in each group to identify differences between groups rather than only comparing estimates of mean-expression as is standard for bulk RNASeq. There are two main approaches to comparing distributions. Firstly, we can use existing statistical models/distributions and fit the same type of model to the expression in each group then test for differences in the parameters for each model, or test whether the model fits better if a particular parameter is allowed to be different according to group. For instance in Chapter ?? we used edgeR to test whether allowing mean expression to be different in different batches significantly improved the fit of a negative binomial model of the data. Alternatively, we can use a non-parametric test which does not assume that expression values follow any particular distribution, e.g. the Kolmogorov-Smirnov test (KS-test). Non-parametric tests generally convert observed expression values to ranks and test whether the distribution of ranks for one group are signficantly different from the distribution of ranks for the other group. However, some non-parametric methods fail in the presence of a large number of tied values, such as the case for dropouts (zeros) in single-cell RNA-seq expression data. 
Moreover, if the conditions for a parametric test hold, then it will typically be more powerful than a non-parametric test.

### 12.1.4 Benchmarking of DE methods for scRNA-seq data

So far there has been one high-quality benchmarking study of single-cell differential expression methods (Soneson and Robinson 2018). The figure below summarises the results from that paper (which is well worth reading in full!):

```r
knitr::include_graphics("figures/soneson-de-benchmark-fig5.png")
```

Figure 12.1: Figure 5 reproduced from Soneson and Robinson (2018). Summary of DE method performance across all major evaluation criteria. Criteria and cutoff values for performance categories are available in the Online Methods. Methods are ranked by their average performance across the criteria, with the numerical encoding good = 2, intermediate = 1, poor = 0. NODES and SAMseq do not return nominal P values and were therefore not evaluated in terms of the FPR.

One particularly surprising outcome of this benchmarking study is that almost all methods designed specifically for the analysis of scRNA-seq data are outperformed by established bulk RNA-seq DE methods (edgeR, limma) and standard, classical statistical methods (t-test, Wilcoxon rank-sum test). MAST (Finak et al. 2015) is the only method designed specifically for scRNA-seq data that performs well in this benchmark. These results are a credit to the durability and flexibility of the leading bulk RNA-seq DE methods, and a subtle indictment of the land rush of new scRNA-seq methods that were published without adequate comparison to existing bulk RNA-seq methods.

### 12.1.5 Models of single-cell RNA-seq data

The most common model of RNA-seq data is the negative binomial model:

```r
set.seed(1)
hist(
    rnbinom(1000, mu = 10, size = 100),
    col = "grey50",
    xlab = "Read Counts",
    main = "Negative Binomial"
)
```

Figure 12.2: Negative binomial distribution of read counts for a single gene across 1000 cells.

Mean: \(\mu = mu\)

Variance: \(\sigma^2 = mu + mu^2/size\)

It is parameterized by the mean expression (`mu`) and the dispersion (`size`), which is inversely related to the variance. The negative binomial model fits bulk RNA-seq data very well and is used by most statistical methods designed for such data. In addition, it has been shown to fit the distribution of molecule counts obtained from data tagged with unique molecular identifiers (UMIs) quite well (Grun et al. 2014, Islam et al. 2011).

However, a raw negative binomial model does not necessarily fit full-length transcript data as well, due to the high dropout rates relative to the non-zero read counts. For this type of data a variety of zero-inflated negative binomial models have been proposed (e.g. MAST, SCDE).

```r
d <- 0.5
counts <- rnbinom(1000, mu = 10, size = 100)
counts[runif(1000) < d] <- 0
hist(
    counts,
    col = "grey50",
    xlab = "Read Counts",
    main = "Zero-inflated NB"
)
```

Figure 12.3: Zero-inflated negative binomial distribution.

Mean: \(\mu = mu \cdot (1 - d)\)

Variance: \(\sigma^2 = mu \cdot (1 - d) \cdot (1 + d \cdot mu + mu/size)\)

These models introduce a new parameter, \(d\), the dropout rate, into the negative binomial model. As we saw in Chapter 19, the dropout rate of a gene is strongly correlated with the mean expression of the gene. Different zero-inflated negative binomial models use different relationships between `mu` and `d`, and some may fit \(\mu\) and \(d\) to the expression of each gene independently. A quick simulation check of these moment formulas is shown below.
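As a sanity check on the zero-inflated mean and variance formulas above, here is a small simulation sketch (not part of the original course code) comparing empirical moments against the formulas:

```r
## Verify the zero-inflated NB moment formulas by simulation
set.seed(1)
mu <- 10; size <- 100; d <- 0.5
x <- rnbinom(1e6, mu = mu, size = size)
x[runif(1e6) < d] <- 0          # inject dropouts at rate d

mean(x)                          # empirical mean
mu * (1 - d)                     # formula: mu * (1 - d) = 5

var(x)                           # empirical variance
mu * (1 - d) * (1 + d * mu + mu / size)  # formula: ~30.5
```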
Finally, several methods use a Poisson-Beta distribution, which is based on a mechanistic model of transcriptional bursting. There is strong experimental support for this model (Kim and Marioni, 2013) and it provides a good fit to scRNA-seq data, but it is less easy to use than the negative binomial models and there are far fewer existing methods built upon it.

```r
a <- 0.1
b <- 0.1
g <- 100
lambdas <- rbeta(1000, a, b)
counts <- sapply(g * lambdas, function(l) {rpois(1, lambda = l)})
hist(
    counts,
    col = "grey50",
    xlab = "Read Counts",
    main = "Poisson-Beta"
)
```

Mean: \(\mu = g \cdot a/(a + b)\)

Variance: \(\sigma^2 = g^2 \cdot a \cdot b/((a + b + 1) \cdot (a + b)^2)\)

This model uses three parameters: \(a\), the rate of activation of transcription; \(b\), the rate of inhibition of transcription; and \(g\), the rate of transcript production while transcription is active at the locus. Differential expression methods may test each of the parameters for differences across groups, or only one (often \(g\)).

All of these models may be further expanded to explicitly account for other sources of gene expression differences, such as batch effects or library depth, depending on the particular DE algorithm.

__Exercise__: Vary the parameters of each distribution to explore how they affect the distribution of gene expression. How similar are the Poisson-Beta and negative binomial models?

## 12.2 DE in a real dataset

```r
library(scRNA.seq.funcs)
library(edgeR)
library(limma)
library(MAST)
library(ROCR)
set.seed(1)
```

### 12.2.1 Introduction

To test different single-cell differential expression methods we will be using the Blischak dataset from Chapters 7-17. For this experiment, bulk RNA-seq data for each cell line was generated in addition to the single-cell data. We will use the differentially expressed genes identified using standard methods on the respective bulk data as the ground truth for evaluating the accuracy of each single-cell method. To save time we have pre-computed these for you. You can run the commands below to load these data.

```r
DE <- read.table("data/tung/TPs.txt")
notDE <- read.table("data/tung/TNs.txt")
GroundTruth <- list(
    DE = as.character(unlist(DE)),
    notDE = as.character(unlist(notDE))
)
```

This ground truth was produced for the comparison of individual NA19101 to NA19239. Now load the respective single-cell data:

```r
molecules <- read.table("data/tung/molecules.txt", sep = "\t")
anno <- read.table("data/tung/annotation.txt", sep = "\t", header = TRUE)
keep <- anno[,1] == "NA19101" | anno[,1] == "NA19239"
data <- molecules[,keep]
group <- anno[keep,1]
batch <- anno[keep,4]
# remove genes that aren't expressed in at least 6 cells
gkeep <- rowSums(data > 0) > 5
counts_mat <- as.matrix(data[gkeep,])
# library size normalization; a variant of CPM for datasets with
# library sizes of fewer than 1 million molecules
lib_size <- colSums(counts_mat)
norm <- t(t(counts_mat) / lib_size * median(lib_size))
```

Now we will compare various single-cell DE methods. We will focus on methods that performed well in Soneson and Robinson's (Soneson and Robinson 2018) detailed comparison of differential expression methods for single-cell data. Note that we will only be running methods which are available as R packages and which run relatively quickly.
### 12.2.2 Kolmogorov-Smirnov test

The types of test that are easiest to work with are non-parametric ones. The most commonly used non-parametric test is the Kolmogorov-Smirnov test (KS-test), and we can use it to compare the distributions for each gene in the two individuals. The KS-test quantifies the distance between the empirical cumulative distributions of the expression of each gene in each of the two populations. It is sensitive to changes in mean expression and to changes in variability. However, it assumes that the data are continuous, and it may perform poorly when the data contain a large number of identical values (e.g. zeros). Another issue with the KS-test is that it can be very sensitive at large sample sizes, and thus may report significance even when the magnitude of the difference is very small.

Figure 12.4: Illustration of the two-sample Kolmogorov-Smirnov statistic. Red and blue lines each correspond to an empirical distribution function, and the black arrow is the two-sample KS statistic.

Now run the test:

```r
pVals <- apply(
    norm, 1, function(x) {
        ks.test(
            x[group == "NA19101"],
            x[group == "NA19239"]
        )$p.value
    }
)
# multiple testing correction
pVals <- p.adjust(pVals, method = "fdr")
```

This code "applies" the function to each row (specified by 1) of the expression matrix, `norm`. In the function we return just the p.value from the `ks.test` output. We can now consider how many of the ground truth positive and negative DE genes are detected by the KS-test.

#### 12.2.2.1 Evaluating Accuracy

```r
sigDE <- names(pVals)[pVals < 0.05]
length(sigDE)
```

```
## [1] 5095
```

```r
# Number of KS-DE genes that are true DE genes
sum(GroundTruth$DE %in% sigDE)
```

```
## [1] 792
```

```r
# Number of KS-DE genes that are truly not-DE
sum(GroundTruth$notDE %in% sigDE)
```

```
## [1] 3190
```

As you can see, many more of our ground truth negative genes were identified as DE by the KS-test (false positives) than ground truth positive genes (true positives). This may be partly due to the larger number of notDE genes, so we typically normalize these counts as the true positive rate (TPR), TP/(TP + FN), and the false discovery rate (FDR), FP/(FP + TP).

```r
tp <- sum(GroundTruth$DE %in% sigDE)
fp <- sum(GroundTruth$notDE %in% sigDE)
tn <- sum(GroundTruth$notDE %in% names(pVals)[pVals >= 0.05])
fn <- sum(GroundTruth$DE %in% names(pVals)[pVals >= 0.05])
tpr <- tp / (tp + fn)
fdr <- fp / (fp + tp)
cat(c(tpr, fdr), "\n")
```

```
## 0.7346939 0.801105
```

Now we can see that the TPR is OK, but the FDR is very high, much higher than we would normally deem acceptable.

So far we've only evaluated the performance at a single significance threshold. Often it is informative to vary the threshold and evaluate performance across a range of values. This is then plotted as a receiver operating characteristic (ROC) curve, and a general accuracy statistic can be calculated as the area under this curve (AUC). We will use the ROCR package to facilitate this plotting.

```r
# Only consider genes for which we know the ground truth
pVals <- pVals[names(pVals) %in% GroundTruth$DE |
               names(pVals) %in% GroundTruth$notDE]
truth <- rep(1, times = length(pVals))
truth[names(pVals) %in% GroundTruth$DE] <- 0
pred <- ROCR::prediction(pVals, truth)
perf <- ROCR::performance(pred, "tpr", "fpr")
ROCR::plot(perf)
```

Figure 12.5: ROC curve for the KS-test.
```r
aucObj <- ROCR::performance(pred, "auc")
aucObj@y.values[[1]] # AUC
```

```
## [1] 0.7954796
```

Finally, to facilitate comparisons with other DE methods, let's put this code into a function so we don't need to repeat it:

```r
DE_Quality_AUC <- function(pVals, plot = TRUE) {
    pVals <- pVals[names(pVals) %in% GroundTruth$DE |
                   names(pVals) %in% GroundTruth$notDE]
    truth <- rep(1, times = length(pVals))
    truth[names(pVals) %in% GroundTruth$DE] <- 0
    pred <- ROCR::prediction(pVals, truth)
    perf <- ROCR::performance(pred, "tpr", "fpr")
    if (plot)
        ROCR::plot(perf)
    aucObj <- ROCR::performance(pred, "auc")
    return(aucObj@y.values[[1]])
}
```

### 12.2.3 Wilcoxon/Mann-Whitney U test

The Wilcoxon rank-sum test is another non-parametric test, but it tests specifically whether values in one group are greater or less than the values in the other group. Thus it is often considered a test for differences in median expression between two groups, whereas the KS-test is sensitive to any change in the distribution of expression values.

```r
pVals <- apply(
    norm, 1, function(x) {
        wilcox.test(
            x[group == "NA19101"],
            x[group == "NA19239"]
        )$p.value
    }
)
# multiple testing correction
pVals <- p.adjust(pVals, method = "fdr")
DE_Quality_AUC(pVals)
```

Figure 12.6: ROC curve for the Wilcoxon test.

```
## [1] 0.8320326
```

### 12.2.4 edgeR

We've already used edgeR for differential expression in Chapter ??. edgeR is based on a negative binomial model of gene expression and uses a generalized linear model (GLM) framework, which enables us to include other factors, such as batch, in the model.

edgeR, like many DE methods, works best with pre-filtering of very lowly expressed genes. Here, we will keep genes with non-zero expression in at least 30 cells. The first steps of this analysis are:

```r
keep_gene <- (rowSums(counts_mat > 0) > 29.5 & rowMeans(counts_mat) > 0.2)
table(keep_gene)
```

```
## keep_gene
## FALSE  TRUE 
##  4661 11365
```

```r
dge <- DGEList(
    counts = counts_mat[keep_gene,],
    norm.factors = rep(1, length(counts_mat[1,])),
    group = group
)
group_edgeR <- factor(group)
design <- model.matrix(~ group_edgeR)
dge <- calcNormFactors(dge, method = "TMM")
summary(dge$samples$norm.factors)
```

```
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.8250  0.9367  0.9584  1.0062  1.0536  1.9618
```

```r
dge <- estimateDisp(dge, design = design)
plotBCV(dge)
```

Figure 12.7: Biological coefficient of variation plot for edgeR.

Next we fit a negative binomial quasi-likelihood model for differential expression.

```r
fit <- glmQLFit(dge, design)
res <- glmQLFTest(fit)
pVals <- res$table[,4]
names(pVals) <- rownames(res$table)
pVals <- p.adjust(pVals, method = "fdr")
DE_Quality_AUC(pVals)
```

Figure 12.8: ROC curve for edgeR.

```
## [1] 0.8557514
```

### 12.2.5 MAST

MAST is based on a zero-inflated negative binomial model. It tests for differential expression using a hurdle model that combines tests of the discrete (zero vs. non-zero) and continuous (non-zero values) aspects of gene expression. Again, this uses a linear modelling framework so that complex models can be considered.
```r
log_counts <- log2(counts_mat + 1)
fData <- data.frame(names = rownames(log_counts))
rownames(fData) <- rownames(log_counts)
cData <- data.frame(cond = group)
rownames(cData) <- colnames(log_counts)

obj <- FromMatrix(as.matrix(log_counts), cData, fData)
colData(obj)$cngeneson <- scale(colSums(assay(obj) > 0))
cond <- factor(colData(obj)$cond)

# Model expression as function of condition & number of detected genes
zlmCond <- zlm(~ cond + cngeneson, obj)
summaryCond <- summary(zlmCond, doLRT = "condNA19101")
summaryDt <- summaryCond$datatable
summaryDt <- as.data.frame(summaryDt)

pVals <- unlist(summaryDt[summaryDt$component == "H", 4]) # H = hurdle model
names(pVals) <- unlist(summaryDt[summaryDt$component == "H", 1])
pVals <- p.adjust(pVals, method = "fdr")
DE_Quality_AUC(pVals)
```

Figure 12.9: ROC curve for MAST.

```
## [1] 0.8284046
```

### 12.2.6 limma

The limma package implements empirical Bayes linear models for identifying differentially expressed genes. Originally developed for microarray data, the extended limma-voom method has proven to be a very fast and accurate DE method for bulk RNA-seq data. With its speed and flexibility, limma also performed well for DE analysis of scRNA-seq data in the benchmarking study mentioned earlier. Here we demonstrate how to use limma for DE analysis on these data.

```r
library(limma)
## we can use the same design matrix structure as for edgeR
design <- model.matrix(~ group_edgeR)
colnames(design)
```

```
## [1] "(Intercept)"        "group_edgeRNA19239"
```

```r
## apply the same filtering of low-abundance genes as for edgeR
keep_gene <- (rowSums(counts_mat > 0) > 29.5 & rowMeans(counts_mat) > 0.2)
summary(keep_gene)
```

```
##    Mode   FALSE    TRUE 
## logical    4661   11365
```

```r
y <- counts_mat[keep_gene, ]
v <- voom(y, design)
fit <- lmFit(v, design)
res <- eBayes(fit)
topTable(res)
```

```
##                      logFC  AveExpr         t       P.Value     adj.P.Val
## ENSG00000106153 -4.3894938 5.207732 -61.83776 1.048992e-268 1.192180e-264
## ENSG00000022556 -3.6419064 5.368174 -54.24475 1.048481e-238 5.957992e-235
## ENSG00000135549 -2.6370360 4.505792 -37.20177 1.129436e-160 4.278680e-157
## ENSG00000185088 -1.2860563 8.418585 -31.33983 2.410684e-130 6.849355e-127
## ENSG00000170906  0.7971144 8.977602  30.90797 4.732891e-128 1.075786e-124
## ENSG00000184674 -2.1768653 4.129290 -30.73100 4.137245e-127 7.836631e-124
## ENSG00000198825  2.1316479 5.229765  28.20805 1.412125e-113 2.292685e-110
## ENSG00000138035 -1.5188509 6.763545 -27.65932 1.305792e-110 1.855040e-107
## ENSG00000164434  1.8715563 5.782091  24.79885  4.465275e-95  5.638650e-92
## ENSG00000088305 -0.7511259 9.394709 -24.69619  1.615578e-94  1.836104e-91
##                        B
## ENSG00000106153 599.5015
## ENSG00000022556 531.5858
## ENSG00000135549 354.5934
## ENSG00000185088 287.1727
## ENSG00000170906 281.9116
## ENSG00000184674 278.2672
## ENSG00000198825 247.7511
## ENSG00000138035 241.5441
## ENSG00000164434 205.7273
## ENSG00000088305 204.7427
```

```r
pVals <- res$p.value
names(pVals) <- rownames(y)
pVals <- p.adjust(pVals, method = "fdr")
DE_Quality_AUC(pVals)
```

```
## [1] 0.4984728
```

__Q:__ How does limma compare on this dataset?

### 12.2.7 Pseudobulk

In this approach, we sum counts across cells from the same condition (here we will use batch within individual) to create "pseudobulk" samples that we then analyse for DE the way we would analyse bulk RNA-seq data. A minimal sketch of the summing step is shown below.
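To see concretely what this summing does, here is a small base-R sketch (illustrative only; the course code that follows uses `scater::sumCountsAcrossCells` instead) that aggregates a toy count matrix by a grouping factor:

```r
## Toy counts: 4 genes x 6 cells, with cells belonging to 2 batches
set.seed(1)
toy_counts <- matrix(rpois(24, lambda = 5), nrow = 4,
                     dimnames = list(paste0("gene", 1:4), paste0("cell", 1:6)))
toy_batch <- factor(c("b1", "b1", "b1", "b2", "b2", "b2"))

## Sum counts across the cells of each batch:
## one "pseudobulk" column per batch
pseudobulk <- t(rowsum(t(toy_counts), toy_batch))
pseudobulk
```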
Research continues on benchmarking pseudobulk and single-cell DE approaches, but the suggestion from certain (we think reliable) sources is that pseudobulk approaches are more robust and accurate (and typically very fast) in a range of settings where we're interested in differences between groups or conditions rather than between individual cells.

```r
summed <- scater::sumCountsAcrossCells(counts_mat, factor(batch))
head(summed)
```

```
##                 NA19101.r1 NA19101.r2 NA19101.r3 NA19239.r1 NA19239.r2 NA19239.r3
## ENSG00000237683         32         23         28         17         33         15
## ENSG00000187634          1          2          7          5          0          7
## ENSG00000188976        206        236        204        237        188        391
## ENSG00000187961         17         32         23         19         19         25
## ENSG00000187583          2          1          0          1          4          1
## ENSG00000187642          0          1          0          3          2          2
```

```r
y <- DGEList(summed)
y <- y[aveLogCPM(y) > 1, ]
y <- calcNormFactors(y)
individ <- gsub("\\.r[123]", "", colnames(summed))
design <- model.matrix(~ individ)
head(design)
```

```
##   (Intercept) individNA19239
## 1           1              0
## 2           1              0
## 3           1              0
## 4           1              1
## 5           1              1
## 6           1              1
```

```r
y <- estimateDisp(y, design)
summary(y$trended.dispersion)
```

```
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01609 0.02518 0.03734 0.05450 0.07362 0.14888
```

```r
plotBCV(y)
fit <- glmQLFit(y)
res <- glmQLFTest(fit)
summary(decideTests(res))
```

```
##        individNA19239
## Down              290
## NotSig          11973
## Up                155
```

```r
pVals <- res$table[, 4]
names(pVals) <- rownames(res$table)
pVals <- p.adjust(pVals, method = "fdr")
DE_Quality_AUC(pVals)
```

```
## [1] 0.8685502
```

__Q:__ What do you make of these pseudobulk results? What are the pros and cons of this approach?

### 12.2.8 Reflection

What is your overall assessment of the DE methods presented in this chapter? When might you choose to use one method in preference to the others? What caveats or potential problems can you think of with conducting DE testing using approaches like those presented here?
12.2.9 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] ROCR_1.0-7 gplots_3.0.1.1 ## [3] MAST_1.10.0 SingleCellExperiment_1.6.0 ## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [7] BiocParallel_1.18.1 matrixStats_0.55.0 ## [9] Biobase_2.44.0 GenomicRanges_1.36.1 ## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## [15] edgeR_3.26.8 limma_3.40.6 ## [17] scRNA.seq.funcs_0.1.0 ## ## loaded via a namespace (and not attached): ## [1] nlme_3.1-139 bitops_1.0-6 ## [3] progress_1.2.2 tools_3.6.0 ## [5] backports_1.1.4 R6_2.4.0 ## [7] irlba_2.3.3 vipor_0.4.5 ## [9] KernSmooth_2.23-15 hypergeo_1.2-13 ## [11] lazyeval_0.2.2 colorspace_1.4-1 ## [13] gridExtra_2.3 tidyselect_0.2.5 ## [15] prettyunits_1.0.2 moments_0.14 ## [17] compiler_3.6.0 orthopolynom_1.0-5 ## [19] BiocNeighbors_1.2.0 bookdown_0.13 ## [21] caTools_1.17.1.2 scales_1.0.0 ## [23] blme_1.0-4 stringr_1.4.0 ## [25] digest_0.6.21 minqa_1.2.4 ## [27] rmarkdown_1.15 XVector_0.24.0 ## [29] scater_1.12.2 pkgconfig_2.0.3 ## [31] htmltools_0.3.6 lme4_1.1-21 ## [33] highr_0.8 rlang_0.4.0 ## [35] DelayedMatrixStats_1.6.1 gtools_3.8.1 ## [37] dplyr_0.8.3 RCurl_1.95-4.12 ## [39] magrittr_1.5 BiocSingular_1.0.0 ## [41] GenomeInfoDbData_1.2.1 Matrix_1.2-17 ## [43] Rcpp_1.0.2 ggbeeswarm_0.6.0 ## [45] munsell_0.5.0 viridis_0.5.1 ## [47] abind_1.4-5 stringi_1.4.3 ## [49] yaml_2.2.0 MASS_7.3-51.1 ## [51] zlibbioc_1.30.0 Rtsne_0.15 ## [53] plyr_1.8.4 grid_3.6.0 ## [55] gdata_2.18.0 crayon_1.3.4 ## [57] contfrac_1.1-12 lattice_0.20-38 ## [59] splines_3.6.0 hms_0.5.1 ## [61] locfit_1.5-9.1 zeallot_0.1.0 ## [63] knitr_1.25 pillar_1.4.2 ## [65] boot_1.3-20 reshape2_1.4.3 ## [67] glue_1.3.1 evaluate_0.14 ## [69] data.table_1.12.2 deSolve_1.24 ## [71] vctrs_0.2.0 nloptr_1.2.1 ## [73] gtable_0.3.0 purrr_0.3.2 ## [75] assertthat_0.2.1 ggplot2_3.2.1 ## [77] xfun_0.9 rsvd_1.0.2 ## [79] viridisLite_0.3.0 tibble_2.1.3 ## [81] elliptic_1.4-0 beeswarm_0.2.3 ## [83] statmod_1.4.32 References "],
 ["imputation.html", "13 Imputation", " 13 Imputation library(scImpute) library(SC3) library(scater) library(SingleCellExperiment) library(mclust) library(DrImpute) set.seed(1234567) As discussed previously, one of the main challenges when analyzing scRNA-seq data is the presence of zeros, or dropouts. The dropouts are assumed to have arisen for three possible reasons: The gene was not expressed in the cell and hence there are no transcripts to sequence The gene was expressed, but for some reason the transcripts were lost somewhere prior to sequencing The gene was expressed and transcripts were captured and turned into cDNA, but the sequencing depth was not sufficient to produce any reads. Thus, dropouts could be result of experimental shortcomings, and if this is the case then we would like to provide computational corrections. One possible solution is to impute the dropouts in the expression matrix. To be able to impute gene expression values, one must have an underlying model. However, since we do not know which dropout events are technical artefacts and which correspond to the transcript being truly absent, imputation is a difficult challenge and prone to creating false-positive results in downstream analysis. There are many different imputation methods available we will consider three fast, published methods: DrImpute and scImpute (Li and Li 2017). DrImpute and scImpute both use a model to determine which zeros are technical and impute only those values. Both use clustering to identify a group of cells that are assumed to have homogenous expression. DrImpute imputes all values that are not consistently zero in all cells of a cluster. Whereas, scImpute uses a zero-inflated normal distribution fit to log-normalized expression values and imputed all inflated zeros. 13.0.1 scImpute To test scImpute, we use the default parameters and we apply it to the Deng dataset that we have worked with before. scImpute takes a .csv or .txt file as an input: deng &lt;- readRDS(&quot;data/deng/deng-reads.rds&quot;) write.csv(counts(deng), &quot;deng.csv&quot;) scimpute( count_path = &quot;deng.csv&quot;, infile = &quot;csv&quot;, outfile = &quot;txt&quot;, out_dir = &quot;./&quot;, Kcluster = 10, ncores = 2 ) ## [1] &quot;reading in raw count matrix ...&quot; ## [1] &quot;number of genes in raw count matrix 22431&quot; ## [1] &quot;number of cells in raw count matrix 268&quot; ## [1] &quot;reading finished!&quot; ## [1] &quot;imputation starts ...&quot; ## [1] &quot;searching candidate neighbors ... 
&quot; ## [1] &quot;inferring cell similarities ...&quot; ## [1] &quot;dimension reduction ...&quot; ## [1] &quot;calculating cell distances ...&quot; ## [1] &quot;cluster sizes:&quot; ## [1] 84 9 72 5 8 12 8 22 24 14 ## [1] &quot;estimating dropout probability for type 1 ...&quot; ## [1] &quot;searching for valid genes ...&quot; ## [1] &quot;imputing dropout values for type 1 ...&quot; ## [1] &quot;estimating dropout probability for type 2 ...&quot; ## [1] &quot;searching for valid genes ...&quot; ## [1] &quot;imputing dropout values for type 2 ...&quot; ## [1] &quot;estimating dropout probability for type 3 ...&quot; ## [1] &quot;searching for valid genes ...&quot; ## [1] &quot;imputing dropout values for type 3 ...&quot; ## [1] &quot;estimating dropout probability for type 4 ...&quot; ## [1] &quot;searching for valid genes ...&quot; ## [1] &quot;imputing dropout values for type 4 ...&quot; ## [1] &quot;estimating dropout probability for type 5 ...&quot; ## [1] &quot;searching for valid genes ...&quot; ## [1] &quot;imputing dropout values for type 5 ...&quot; ## [1] &quot;estimating dropout probability for type 6 ...&quot; ## [1] &quot;searching for valid genes ...&quot; ## [1] &quot;imputing dropout values for type 6 ...&quot; ## [1] &quot;estimating dropout probability for type 7 ...&quot; ## [1] &quot;searching for valid genes ...&quot; ## [1] &quot;imputing dropout values for type 7 ...&quot; ## [1] &quot;estimating dropout probability for type 8 ...&quot; ## [1] &quot;searching for valid genes ...&quot; ## [1] &quot;imputing dropout values for type 8 ...&quot; ## [1] &quot;estimating dropout probability for type 9 ...&quot; ## [1] &quot;searching for valid genes ...&quot; ## [1] &quot;imputing dropout values for type 9 ...&quot; ## [1] &quot;estimating dropout probability for type 10 ...&quot; ## [1] &quot;searching for valid genes ...&quot; ## [1] &quot;imputing dropout values for type 10 ...&quot; ## [1] &quot;writing imputed count matrix ...&quot; ## [1] 17 18 88 111 126 177 186 229 244 247 Now we can compare the results with the original data by considering a PCA plot: res &lt;- read.table(&quot;scimpute_count.txt&quot;) colnames(res) &lt;- NULL res &lt;- SingleCellExperiment( assays = list(logcounts = log2(as.matrix(res) + 1)), colData = colData(deng) ) rowData(res)$feature_symbol &lt;- rowData(deng)$feature_symbol plotPCA( res, colour_by = &quot;cell_type2&quot; ) Compare this result to the original data in Chapter 10.2. What are the most significant differences? We can examine the expression of specific genes to directly see the effect of imputation on the expression distribution. plotExpression(res, c(&quot;Sox2&quot;, &quot;Eomes&quot;, &quot;Zscan4d&quot;, &quot;Fgf4&quot;)) plotExpression(deng, c(&quot;Sox2&quot;, &quot;Eomes&quot;, &quot;Zscan4d&quot;, &quot;Fgf4&quot;)) To evaluate the impact of the imputation, we use SC3 to cluster the imputed matrix: res &lt;- sc3_estimate_k(res) metadata(res)$sc3$k_estimation ## [1] 6 res &lt;- sc3(res, ks = 10, n_cores = 1, gene_filter = FALSE) adjustedRandIndex(colData(deng)$cell_type2, colData(res)$sc3_10_clusters) ## [1] 0.5062983 plotPCA( res, colour_by = &quot;sc3_10_clusters&quot; ) Exercise: Based on the PCA and the clustering results, do you think that imputation using scImpute is a good idea for the Deng dataset?
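Before moving on, it is worth quantifying how much the imputation changed the matrix. A minimal sketch (assuming the deng and imputed res objects from above) compares the fraction of zero entries before and after scImpute:
# fraction of zero entries in the raw counts versus the imputed log-counts
mean(counts(deng) == 0)
mean(logcounts(res) == 0)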
13.0.2 DrImpute We can do the same for DrImpute. DrImpute runs on a log-normalized expression matrix directly in R; we generate this matrix using scater and then run DrImpute. Unlike scImpute, DrImpute considers the consensus imputation across a range of k values using two different correlation distances: deng &lt;- normalize(deng) res &lt;- DrImpute(logcounts(deng), ks=8:12) ## Calculating Spearman distance. ## Calculating Pearson distance. ## Clustering for k : 8 ## Clustering for k : 9 ## Clustering for k : 10 ## Clustering for k : 11 ## Clustering for k : 12 ## cls object have 10 number of clustering sets. ## ## ## Zero percentage : ## Before impute : 60 percent. ## After impute : 17 percent. ## 43 percent of zeros are imputed. colnames(res) &lt;- colnames(deng) rownames(res) &lt;- rownames(deng) res &lt;- SingleCellExperiment( assays = list(logcounts = as.matrix(res)), colData = colData(deng) ) rowData(res)$feature_symbol &lt;- rowData(deng)$feature_symbol plotPCA( res, colour_by = &quot;cell_type2&quot; ) plotExpression(res, c(&quot;Sox2&quot;, &quot;Eomes&quot;, &quot;Zscan4d&quot;, &quot;Fgf4&quot;)) Exercise: Check the sc3 clustering of the DrImpute matrix; do you think that imputation using DrImpute is a good idea for the Deng dataset? Exercise: What is the difference between scImpute and DrImpute based on the PCA and clustering analysis? Which one do you think is best to use? 13.0.3 sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] stats4 parallel stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] DrImpute_1.0 mclust_5.4.5 ## [3] scater_1.12.2 ggplot2_3.2.1 ## [5] SingleCellExperiment_1.6.0 SummarizedExperiment_1.14.1 ## [7] DelayedArray_0.10.0 BiocParallel_1.18.1 ## [9] matrixStats_0.55.0 Biobase_2.44.0 ## [11] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 ## [13] IRanges_2.18.3 S4Vectors_0.22.1 ## [15] BiocGenerics_0.30.0 SC3_1.12.0 ## [17] scImpute_0.0.9 doParallel_1.0.15 ## [19] iterators_1.0.12 foreach_1.4.7 ## [21] penalized_0.9-51 survival_2.43-3 ## ## loaded via a namespace (and not attached): ## [1] bitops_1.0-6 RColorBrewer_1.1-2 ## [3] tools_3.6.0 doRNG_1.7.1 ## [5] irlba_2.3.3 R6_2.4.0 ## [7] vipor_0.4.5 KernSmooth_2.23-15 ## [9] lazyeval_0.2.2 colorspace_1.4-1 ## [11] withr_2.1.2 gridExtra_2.3 ## [13] tidyselect_0.2.5 compiler_3.6.0 ## [15] BiocNeighbors_1.2.0 pkgmaker_0.27 ## [17] labeling_0.3 bookdown_0.13 ## [19] caTools_1.17.1.2 scales_1.0.0 ## [21] DEoptimR_1.0-8 mvtnorm_1.0-11 ## [23] robustbase_0.93-5 stringr_1.4.0 ## [25] digest_0.6.21 rmarkdown_1.15 ## [27] XVector_0.24.0 rrcov_1.4-7 ## [29] pkgconfig_2.0.3 htmltools_0.3.6 ## [31] bibtex_0.4.2 WriteXLS_5.0.0 ## [33] rlang_0.4.0 DelayedMatrixStats_1.6.1 ## [35] shiny_1.3.2 gtools_3.8.1 ## [37] dplyr_0.8.3 BiocSingular_1.0.0 ## [39] RCurl_1.95-4.12 magrittr_1.5 ## [41] GenomeInfoDbData_1.2.1 Matrix_1.2-17 ## [43] ggbeeswarm_0.6.0 Rcpp_1.0.2 ## [45] munsell_0.5.0 viridis_0.5.1 ## [47] stringi_1.4.3 yaml_2.2.0 ## [49] zlibbioc_1.30.0 gplots_3.0.1.1 ## [51] grid_3.6.0 gdata_2.18.0 ## [53] promises_1.0.1 crayon_1.3.4 ## [55] lattice_0.20-38 cowplot_1.0.0 ## [57] splines_3.6.0 knitr_1.25 ## [59] pillar_1.4.2
rngtools_1.4 ## [61] codetools_0.2-16 glue_1.3.1 ## [63] evaluate_0.14 httpuv_1.5.2 ## [65] gtable_0.3.0 purrr_0.3.2 ## [67] kernlab_0.9-27 assertthat_0.2.1 ## [69] xfun_0.9 mime_0.7 ## [71] rsvd_1.0.2 xtable_1.8-4 ## [73] e1071_1.7-2 later_0.8.0 ## [75] viridisLite_0.3.0 class_7.3-15 ## [77] pcaPP_1.9-73 tibble_2.1.3 ## [79] pheatmap_1.0.12 beeswarm_0.2.3 ## [81] registry_0.5-1 cluster_2.1.0 ## [83] ROCR_1.0-7 References "],
 ["comparing-and-combining-scrna-seq-datasets.html", "14 Comparing and combining scRNA-seq datasets 14.1 Introduction 14.2 Datasets 14.3 Projecting cells onto annotated cell-types (scmap) 14.4 Cell-to-Cell mapping 14.5 mnnCorrect 14.6 “Anchor” integration (Seurat) 14.7 Search scRNA-Seq data", " 14 Comparing and combining scRNA-seq datasets library(scater) library(SingleCellExperiment) 14.1 Introduction As more and more scRNA-seq datasets become available, carrying merged_seurat comparisons between them is key. There are two main approaches to comparing scRNASeq datasets. The first approach is “label-centric” which is focused on trying to identify equivalent cell-types/states across datasets by comparing individual cells or groups of cells. The other approach is “cross-dataset normalization” which attempts to computationally remove experiment-specific technical/biological effects so that data from multiple experiments can be combined and jointly analyzed. The label-centric approach can be used with dataset with high-confidence cell-annotations, e.g. the Human Cell Atlas (HCA) (Regev et al. 2017) or the Tabula Muris (???) once they are completed, to project cells or clusters from a new sample onto this reference to consider tissue composition and/or identify cells with novel/unknown identity. Conceptually, such projections are similar to the popular BLAST method (Altschul et al. 1990), which makes it possible to quickly find the closest match in a database for a newly identified nucleotide or amino acid sequence. The label-centric approach can also be used to compare datasets of similar biological origin collected by different labs to ensure that the annotation and the analysis is consistent. Figure 2.4: Label-centric dataset comparison can be used to compare the annotations of two different samples. Figure 2.5: Label-centric dataset comparison can project cells from a new experiment onto an annotated reference. The cross-dataset normalization approach can also be used to compare datasets of similar biological origin, unlike the label-centric approach it enables the join analysis of multiple datasets to facilitate the identification of rare cell-types which may to too sparsely sampled in each individual dataset to be reliably detected. However, cross-dataset normalization is not applicable to very large and diverse references since it assumes a significant portion of the biological variablility in each of the datasets overlaps with others. Figure 2.6: Cross-dataset normalization enables joint-analysis of 2+ scRNASeq datasets. 14.2 Datasets We will running these methods on two human pancreas datasets: (Muraro et al. 2016) and (Segerstolpe et al. 2016). Since the pancreas has been widely studied, these datasets are well annotated. muraro &lt;- readRDS(&quot;data/pancreas/muraro.rds&quot;) segerstolpe &lt;- readRDS(&quot;data/pancreas/segerstolpe.rds&quot;) This data has already been formatted for scmap. Cell type labels must be stored in the cell_type1 column of the colData slots, and gene ids that are consistent across both datasets must be stored in the feature_symbol column of the rowData slots. 
First, let’s check that our gene ids match across both datasets: sum(rowData(muraro)$feature_symbol %in% rowData(segerstolpe)$feature_symbol)/nrow(muraro) ## [1] 0.9599519 sum(rowData(segerstolpe)$feature_symbol %in% rowData(muraro)$feature_symbol)/nrow(segerstolpe) ## [1] 0.719334 Here we can see that 96% of the genes present in muraro match genes in segerstolpe and 72% of the genes in segerstolpe match genes in muraro. This is as expected because the segerstolpe dataset was more deeply sequenced than the muraro dataset. However, it highlights some of the difficulties in comparing scRNASeq datasets. We can confirm this by checking the overall size of these two datasets. dim(muraro) ## [1] 19127 2126 dim(segerstolpe) ## [1] 25525 3514 In addition, we can check the cell-type annotations for each of these datasets using the command below: summary(factor(colData(muraro)$cell_type1)) ## acinar alpha beta delta ductal endothelial ## 219 812 448 193 245 21 ## epsilon gamma mesenchymal unclear ## 3 101 80 4 summary(factor(colData(segerstolpe)$cell_type1)) ## acinar alpha beta ## 185 886 270 ## co-expression delta ductal ## 39 114 386 ## endothelial epsilon gamma ## 16 7 197 ## mast MHC class II not applicable ## 7 5 1305 ## PSC unclassified unclassified endocrine ## 54 2 41 Here we can see that even though both datasets considered the same biological tissue, they have been annotated with slightly different sets of cell-types. If you are familiar with pancreas biology you might recognize that the pancreatic stellate cells (PSCs) in segerstolpe are a type of mesenchymal stem cell which would fall under the “mesenchymal” type in muraro. However, it isn’t clear whether these two annotations should be considered synonymous or not. We can use label-centric comparison methods to determine if these two cell-type annotations are indeed equivalent. Alternatively, we might be interested in understanding the function of those cells that were “unclassified endocrine” or were deemed too poor quality (“not applicable”) for the original clustering in each dataset by leveraging information across datasets. Either we could attempt to infer which of the existing annotations they most likely belong to using label-centric approaches or we could try to uncover a novel cell-type among them (or a sub-type within the existing annotations) using cross-dataset normalization. To simplify our demonstration analyses we will remove the small classes of unassigned cells, and the poor quality cells. We will retain the “unclassified endocrine” to see if any of these methods can elucidate what cell-type they belong to. segerstolpe &lt;- segerstolpe[,colData(segerstolpe)$cell_type1 != &quot;unclassified&quot;] segerstolpe &lt;- segerstolpe[,colData(segerstolpe)$cell_type1 != &quot;not applicable&quot;,] muraro &lt;- muraro[,colData(muraro)$cell_type1 != &quot;unclear&quot;] 14.3 Projecting cells onto annotated cell-types (scmap) library(scmap) set.seed(1234567) Martin Hemberg and colleagues recently developed scmap (Kiselev and Hemberg 2017)—a method for projecting cells from a scRNA-seq experiment onto the cell-types identified in other experiments. Additionally, a cloud version of scmap can be run for free, without restrictions, from http://www.hemberg-lab.cloud/scmap. 14.3.0.1 Feature Selection Once we have a SingleCellExperiment object we can run scmap. First we have to build the “index” of our reference clusters.
Since we want to know whether PSCs and mesenchymal cells are synonymous, we will project each dataset onto the other, so we will build an index for each dataset. This requires first selecting the most informative features for the reference dataset. muraro &lt;- selectFeatures(muraro, suppress_plot = FALSE) Genes highlighted with the red colour will be used in the further analysis (projection). segerstolpe &lt;- selectFeatures(segerstolpe, suppress_plot = FALSE) From the y-axis of these plots we can see that scmap uses a dropout-based feature selection method. Now calculate the cell-type index: muraro &lt;- indexCluster(muraro) segerstolpe &lt;- indexCluster(segerstolpe) We can also visualize the index: heatmap(as.matrix(metadata(muraro)$scmap_cluster_index)) You may want to adjust your features using the setFeatures function if features are too heavily concentrated in only a few cell-types. In this case the dropout-based features look good so we will just use them. Exercise Using the rowData of each dataset, how many genes were selected as features in both datasets? What does this tell you about these datasets? Answer 14.3.1 Projecting scmap computes the distance from each cell to each cell-type in the reference index, then applies an empirically derived threshold to determine which cells are assigned to the closest reference cell-type and which are unassigned. To account for differences in sequencing depth, distance is calculated using the Spearman correlation and cosine distance, and only cells with a consistent assignment with both distances are returned as assigned. We will project the segerstolpe dataset onto the muraro dataset: seger_to_muraro &lt;- scmapCluster( projection = segerstolpe, index_list = list( muraro = metadata(muraro)$scmap_cluster_index ) ) and muraro onto segerstolpe muraro_to_seger &lt;- scmapCluster( projection = muraro, index_list = list( seger = metadata(segerstolpe)$scmap_cluster_index ) ) Note that in each case we are projecting to a single dataset but that this could be extended to any number of datasets for which we have computed indices. Now let’s compare the original cell-type labels with the projected labels: table(colData(muraro)$cell_type1, muraro_to_seger$scmap_cluster_labs) ## ## acinar alpha beta co-expression delta ductal endothelial ## acinar 211 0 0 0 0 0 0 ## alpha 1 763 0 18 0 2 0 ## beta 2 1 397 7 2 2 0 ## delta 0 0 2 1 173 0 0 ## ductal 7 0 0 0 0 208 0 ## endothelial 0 0 0 0 0 0 15 ## epsilon 0 0 0 0 0 0 0 ## gamma 2 0 0 0 0 0 0 ## mesenchymal 0 0 0 0 0 1 0 ## ## epsilon gamma MHC class II PSC unassigned ## acinar 0 0 0 0 8 ## alpha 0 2 0 0 26 ## beta 0 5 1 2 29 ## delta 0 0 0 0 17 ## ductal 0 0 5 3 22 ## endothelial 0 0 0 1 5 ## epsilon 3 0 0 0 0 ## gamma 0 95 0 0 4 ## mesenchymal 0 0 0 77 2 Here we can see that cell-types do map to their equivalents in segerstolpe, and importantly we see that all but one of the “mesenchymal” cells were assigned to the “PSC” class.
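To put a number on this correspondence, a short sketch (assuming the objects above) computes the fraction of muraro mesenchymal cells that project to the segerstolpe PSC label:
tab &lt;- table(colData(muraro)$cell_type1, muraro_to_seger$scmap_cluster_labs)
# proportion of &quot;mesenchymal&quot; cells assigned to &quot;PSC&quot;
tab[&quot;mesenchymal&quot;, &quot;PSC&quot;] / sum(tab[&quot;mesenchymal&quot;, ])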
table(colData(segerstolpe)$cell_type1, seger_to_muraro$scmap_cluster_labs) ## ## acinar alpha beta delta ductal endothelial ## acinar 181 0 0 0 4 0 ## alpha 0 869 1 0 0 0 ## beta 0 0 260 0 0 0 ## co-expression 0 7 31 0 0 0 ## delta 0 0 1 111 0 0 ## ductal 0 0 0 0 383 0 ## endothelial 0 0 0 0 0 14 ## epsilon 0 0 0 0 0 0 ## gamma 0 2 0 0 0 0 ## mast 0 0 0 0 0 0 ## MHC class II 0 0 0 0 0 0 ## PSC 0 0 1 0 0 0 ## unclassified endocrine 0 0 0 0 0 0 ## ## epsilon gamma mesenchymal unassigned ## acinar 0 0 0 0 ## alpha 0 0 0 16 ## beta 0 0 0 10 ## co-expression 0 0 0 1 ## delta 0 0 0 2 ## ductal 0 0 0 3 ## endothelial 0 0 0 2 ## epsilon 6 0 0 1 ## gamma 0 192 0 3 ## mast 0 0 0 7 ## MHC class II 0 0 0 5 ## PSC 0 0 53 0 ## unclassified endocrine 0 0 0 41 Again we see cell-types match each other and that all but one of the “PSCs” match the “mesenchymal” cells, providing strong evidence that these two annotations should be considered synonymous. We can also visualize these tables using a Sankey diagram: plot(getSankey(colData(muraro)$cell_type1, muraro_to_seger$scmap_cluster_labs[,1], plot_height=400)) Exercise How many of the previously unclassified cells would we be able to assign to cell-types using scmap? Answer 14.4 Cell-to-Cell mapping scmap can also project each cell in one dataset to its approximate closest neighbouring cell in the reference dataset. This uses a highly optimized search algorithm allowing it to be scaled to very large references (in theory 100,000–millions of cells). However, this process is stochastic so we must fix the random seed to ensure we can reproduce our results. We have already performed feature selection for this dataset so we can go straight to building the index. set.seed(193047) segerstolpe &lt;- indexCell(segerstolpe) muraro &lt;- indexCell(muraro) In this case the index is a series of clusterings of each cell using different sets of features; the parameters k and M are the number of clusters and the number of features used in each of these subclusterings. New cells are assigned to the nearest cluster in each subclustering to generate a unique pattern of cluster assignments. We then find the cell in the reference dataset with the same or most similar pattern of cluster assignments. We can examine the cluster assignment patterns for the reference datasets using: metadata(muraro)$scmap_cell_index$subclusters[1:5,1:5] ## D28.1_1 D28.1_13 D28.1_15 D28.1_17 D28.1_2 ## [1,] 4 6 4 24 34 ## [2,] 16 27 14 11 3 ## [3,] 18 12 17 26 24 ## [4,] 44 33 41 44 12 ## [5,] 39 44 23 34 32 To project and find the w nearest neighbours we use a similar command as before: muraro_to_seger &lt;- scmapCell( projection = muraro, index_list = list( seger = metadata(segerstolpe)$scmap_cell_index ), w = 5 ) We can again look at the results: muraro_to_seger$seger[[1]][,1:5] ## D28.1_1 D28.1_13 D28.1_15 D28.1_17 D28.1_2 ## [1,] 1448 1381 1125 1834 1553 ## [2,] 1731 855 1081 1731 1437 ## [3,] 1229 2113 2104 1790 1890 ## [4,] 2015 1833 1153 1882 1593 ## [5,] 1159 1854 1731 1202 1078 This shows the column numbers of the 5 nearest neighbours in segerstolpe to each of the cells in muraro. We could then calculate a pseudotime estimate, branch assignment, or other cell-level data by selecting the appropriate data from the colData of the segerstolpe data set.
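Rather than using only the single closest cell, one could take a majority vote across all w neighbours. A sketch of such a helper (hypothetical, not part of the scmap workflow, assuming the objects above):
# majority vote: the most common cell type among each cell's 5 nearest neighbours
nn_vote &lt;- apply(muraro_to_seger$seger[[1]], 2, function(idx)
    names(which.max(table(colData(segerstolpe)$cell_type1[idx]))))
head(nn_vote)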
As a simpler demonstration, we will find the cell-type of the single nearest neighbour of each cell. cell_type_NN &lt;- colData(segerstolpe)$cell_type1[muraro_to_seger$seger[[1]][1,]] head(cell_type_NN) ## [1] &quot;alpha&quot; &quot;ductal&quot; &quot;alpha&quot; &quot;alpha&quot; &quot;endothelial&quot; ## [6] &quot;endothelial&quot; Exercise: How well does the cell-to-cell mapping work, in your estimation? How many cell type annotations agree/disagree between the top nearest-neighbour cells? 14.5 mnnCorrect mnnCorrect corrects datasets to facilitate joint analysis. In order to account for differences in composition between two replicates or two different experiments, it first matches individual cells across experiments to find the overlapping biological structure. Using that overlap it learns which dimensions of expression correspond to the biological state and which dimensions correspond to batch/experiment effect; mnnCorrect assumes these dimensions are orthogonal to each other in high dimensional expression space. Finally it removes the batch/experiment effects from the entire expression matrix to return the corrected matrix. To match individual cells to each other across datasets, mnnCorrect uses the cosine distance to avoid library-size effects, then identifies mutual nearest neighbours (k determines the neighbourhood size) across datasets. Only overlapping biological groups should have mutual nearest neighbours (see panel b below). However, this assumes that k is set to approximately the size of the smallest biological group in the datasets, but a k that is too low will identify too few mutual nearest-neighbour pairs to get a good estimate of the batch effect we want to remove. Learning the biological/technical effects is done with either singular value decomposition, similar to the RUV approach we encountered in the batch-correction section, or with principal component analysis with the optimized irlba package, which should be faster than SVD. The parameter svd.dim specifies how many dimensions should be kept to summarize the biological structure of the data; we will set it to three as we found three major groups using Metaneighbor above. These estimates may be further adjusted by smoothing (sigma) and/or variance adjustment (var.adj). mnnCorrect also assumes you’ve already subset your expression matrices so that they contain identical genes in the same order, so we need to ensure that is the case before proceeding with mnnCorrect. Note: The analysis presented below is relatively time and memory consuming. As such, we have not run these commands when compiling this website and therefore no output is shown here. Approach with caution if you are trying to run this analysis on a laptop! Figure 14.1: mnnCorrect batch/dataset effect correction. From Haghverdi et al. 2017 is.common &lt;- rowData(muraro)$feature_symbol %in% rowData(segerstolpe)$feature_symbol muraro &lt;- muraro[is.common,] segerstolpe &lt;- segerstolpe[match(rowData(muraro)$feature_symbol, rowData(segerstolpe)$feature_symbol),] rownames(segerstolpe) &lt;- rowData(segerstolpe)$feature_symbol rownames(muraro) &lt;- rowData(muraro)$feature_symbol identical(rownames(segerstolpe), rownames(muraro)) ## [1] TRUE library(&quot;batchelor&quot;) identical(rownames(muraro), rownames(segerstolpe)) # mnnCorrect will take several minutes to run corrected &lt;- mnnCorrect(logcounts(muraro), logcounts(segerstolpe), k=20) First let’s check that we found a sufficient number of mnn pairs; mnnCorrect returns a list of dataframes with the mnn pairs for each dataset.
dim(corrected$pairs[[1]]) # muraro -&gt; others dim(corrected$pairs[[2]]) # seger -&gt; others The first and second columns contain the cell column IDs and the third column contains a number indicating which dataset/batch the column 2 cell belongs to. In our case, we are only comparing two datasets so all the mnn pairs have been assigned to the second table and the third column contains only ones. head(corrected$pairs[[2]]) total_pairs &lt;- nrow(corrected$pairs[[2]]) n_unique_seger &lt;- length(unique((corrected$pairs[[2]][,1]))) n_unique_muraro &lt;- length(unique((corrected$pairs[[2]][,2]))) mnnCorrect found total_pairs sets of mutual nearest-neighbours between n_unique_seger segerstolpe cells and n_unique_muraro muraro cells. This should be a sufficient number of pairs but the low number of unique cells in each dataset suggests we might not have captured the full biological signal in each dataset. Exercise Which cell-types had mnns across these datasets? Should we increase/decrease k? Answer Now we could create a combined dataset to jointly analyse these data. However, the corrected data is no longer counts and usually will contain negative expression values, thus some analysis tools may no longer be appropriate. For simplicity let’s just plot a joint TSNE. library(&quot;Rtsne&quot;) joint_expression_matrix &lt;- cbind(corrected$corrected[[1]], corrected$corrected[[2]]) # var.genes is not defined elsewhere on this page; one simple choice is the # 1000 most variable genes of the corrected matrix var.genes &lt;- rownames(joint_expression_matrix)[order(apply(joint_expression_matrix, 1, var), decreasing = TRUE)[1:1000]] # Tsne will take some time to run on the full dataset joint_tsne &lt;- Rtsne(t(joint_expression_matrix[rownames(joint_expression_matrix) %in% var.genes,]), initial_dims=10, theta=0.75, check_duplicates=FALSE, max_iter=200, stop_lying_iter=50, mom_switch_iter=50) dataset_labels &lt;- factor(rep(c(&quot;m&quot;, &quot;s&quot;), times=c(ncol(muraro), ncol(segerstolpe)))) cell_type_labels &lt;- factor(c(colData(muraro)$cell_type1, colData(segerstolpe)$cell_type1)) plot(joint_tsne$Y[,1], joint_tsne$Y[,2], pch=c(16,1)[dataset_labels], col=rainbow(length(levels(cell_type_labels)))[cell_type_labels]) 14.6 “Anchor” integration (Seurat) The Seurat package originally proposed another correction method for combining multiple datasets, called CCA. However, unlike mnnCorrect, it doesn’t correct the expression matrix itself directly. Instead Seurat finds a lower-dimensional subspace for each dataset, then corrects these subspaces. Also different from mnnCorrect, Seurat could only combine a single pair of datasets at a time with CCA. Recently, the Seurat team proposed a new method to integrate single-cell datasets using “anchors” (???). Figure 14.2: Seurat integration with anchors. Note: because Seurat uses up a lot of memory you may need to restart your R session to load it, and the plots/output won’t be automatically generated on this page. Also the analysis presented below is relatively time and memory consuming. As such, we have not run these commands when compiling this website and therefore no output is shown here. Approach with caution if you are trying to run this analysis on a laptop!
Reload the data: muraro &lt;- readRDS(&quot;data/pancreas/muraro.rds&quot;) segerstolpe &lt;- readRDS(&quot;data/pancreas/segerstolpe.rds&quot;) segerstolpe &lt;- segerstolpe[,colData(segerstolpe)$cell_type1 != &quot;unclassified&quot;] segerstolpe &lt;- segerstolpe[,colData(segerstolpe)$cell_type1 != &quot;not applicable&quot;,] muraro &lt;- muraro[,colData(muraro)$cell_type1 != &quot;unclear&quot;] is.common &lt;- rowData(muraro)$feature_symbol %in% rowData(segerstolpe)$feature_symbol muraro &lt;- muraro[is.common,] segerstolpe &lt;- segerstolpe[match(rowData(muraro)$feature_symbol, rowData(segerstolpe)$feature_symbol),] rownames(segerstolpe) &lt;- rowData(segerstolpe)$feature_symbol rownames(muraro) &lt;- rowData(muraro)$feature_symbol identical(rownames(segerstolpe), rownames(muraro)) First we will reformat our data into Seurat objects: library(&quot;Seurat&quot;) library(scater) library(SingleCellExperiment) set.seed(4719364) muraro_seurat &lt;- CreateSeuratObject(normcounts(muraro)) # raw counts aren&#39;t available for muraro muraro_seurat@meta.data[, &quot;dataset&quot;] &lt;- 1 muraro_seurat@meta.data[, &quot;celltype&quot;] &lt;- paste(&quot;m&quot;,colData(muraro)$cell_type1, sep=&quot;-&quot;) seger_seurat &lt;- CreateSeuratObject(counts(segerstolpe)) seger_seurat@meta.data[, &quot;dataset&quot;] &lt;- 2 seger_seurat@meta.data[, &quot;celltype&quot;] &lt;- paste(&quot;s&quot;,colData(segerstolpe)$cell_type1, sep=&quot;-&quot;) Next we must normalize, scale and identify highly variable genes for each dataset: muraro_seurat &lt;- NormalizeData(object=muraro_seurat) muraro_seurat &lt;- ScaleData(object=muraro_seurat) muraro_seurat &lt;- FindVariableFeatures(object=muraro_seurat) seger_seurat &lt;- NormalizeData(object=seger_seurat) seger_seurat &lt;- ScaleData(object=seger_seurat) seger_seurat &lt;- FindVariableFeatures(object=seger_seurat) Even though Seurat corrects for the relationship between dispersion and mean expression, it doesn’t use the corrected value when ranking features. Compare the results of the command below with the results in the plots above: head(VariableFeatures(muraro_seurat), 50) head(VariableFeatures(seger_seurat), 50) Next, we use the SelectIntegrationFeatures function to select which genes to use to integrate the two datasets. seurat_list &lt;- list(muraro = muraro_seurat, seger = seger_seurat) features &lt;- SelectIntegrationFeatures(object.list = seurat_list) seurat_list &lt;- lapply(X = seurat_list, FUN = function(x) { x &lt;- ScaleData(x, features = features, verbose = FALSE) x &lt;- RunPCA(x, features = features, verbose = FALSE) }) Now we will run FindIntegrationAnchors and then IntegrateData to integrate the datasets: pancreas.anchors &lt;- FindIntegrationAnchors(object.list = seurat_list, dims = 1:30) pancreas.integrated &lt;- IntegrateData(anchorset = pancreas.anchors, dims = 1:30) To view the output of the integration we could run the following plotting commands. library(ggplot2) library(cowplot) # switch to integrated assay. The variable features of this assay are automatically
# set during IntegrateData DefaultAssay(pancreas.integrated) &lt;- &quot;integrated&quot; # Run the standard workflow for visualization and clustering pancreas.integrated &lt;- ScaleData(pancreas.integrated, verbose = FALSE) pancreas.integrated &lt;- RunPCA(pancreas.integrated, npcs = 30, verbose = FALSE) pancreas.integrated &lt;- RunUMAP(pancreas.integrated, reduction = &quot;pca&quot;, dims = 1:30) p1 &lt;- DimPlot(pancreas.integrated, reduction = &quot;umap&quot;, group.by = &quot;dataset&quot;) p2 &lt;- DimPlot(pancreas.integrated, reduction = &quot;umap&quot;, group.by = &quot;celltype&quot;, label = TRUE, repel = TRUE) + NoLegend() plot_grid(p1, p2) Q: How well do you think this approach integrates these two pancreatic datasets? Advanced Exercise Use the clustering methods we previously covered on the combined datasets. Do you identify any novel cell-types? For more details on using Seurat for this type of analysis, please consult the relevant vignettes and documentation. 14.7 Search scRNA-Seq data library(scfind) library(SingleCellExperiment) library(plotly) set.seed(1234567) 14.7.1 About scfind is a tool that allows one to search single cell RNA-Seq collections (atlases) using lists of genes, e.g. searching for cells and cell-types where a specific set of genes are expressed. scfind is available as a GitHub package. A cloud implementation of scfind with a large collection of datasets is available on our website. Figure 14.3: scfind can be used to search a large collection of scRNA-seq data by a list of gene IDs. 14.7.2 Dataset We will run scfind on the Tabula Muris 10X dataset. scfind also operates on the SingleCellExperiment class: tm10x_heart &lt;- readRDS(&quot;data/sce/Heart_10X.rds&quot;) tm10x_heart &lt;- tm10x_heart[, !is.na(tm10x_heart$cell_type1)] tm10x_heart ## class: SingleCellExperiment ## dim: 23341 624 ## metadata(1): log.exprs.offset ## assays(3): counts normcounts logcounts ## rownames(23341): Xkr4 Rp1 ... Tdtom_transgene zsGreen_transgene ## rowData names(1): feature_symbol ## colnames(624): 10X_P7_4_AAACCTGCACGACGAA-1 ## 10X_P7_4_AAACCTGGTTCCACGG-1 ... 10X_P7_4_TTTGTCAGTCGATTGT-1 ## 10X_P7_4_TTTGTCAGTGCGCTTG-1 ## colData names(5): mouse cell_type1 tissue subtissue sex ## reducedDimNames(3): UMAP PCA TSNE ## spikeNames(0): colData(tm10x_heart) ## DataFrame with 624 rows and 5 columns ## mouse cell_type1 tissue ## &lt;factor&gt; &lt;factor&gt; &lt;factor&gt; ## 10X_P7_4_AAACCTGCACGACGAA-1 3-F-56 endothelial cell Heart ## 10X_P7_4_AAACCTGGTTCCACGG-1 3-F-56 fibroblast Heart ## 10X_P7_4_AAACCTGTCTCGATGA-1 3-F-56 endothelial cell Heart ## 10X_P7_4_AAACGGGAGCGCTCCA-1 3-F-56 cardiac muscle cell Heart ## 10X_P7_4_AAAGCAAAGTGAATTG-1 3-F-56 fibroblast Heart ## ... ... ... ... ## 10X_P7_4_TTTGGTTCAGACTCGC-1 3-F-56 cardiac muscle cell Heart ## 10X_P7_4_TTTGTCAAGTGGAGAA-1 3-F-56 cardiac muscle cell Heart ## 10X_P7_4_TTTGTCACAAGCCATT-1 3-F-56 endocardial cell Heart ## 10X_P7_4_TTTGTCAGTCGATTGT-1 3-F-56 endothelial cell Heart ## 10X_P7_4_TTTGTCAGTGCGCTTG-1 3-F-56 fibroblast Heart ## subtissue sex ## &lt;factor&gt; &lt;factor&gt; ## 10X_P7_4_AAACCTGCACGACGAA-1 NA F ## 10X_P7_4_AAACCTGGTTCCACGG-1 NA F ## 10X_P7_4_AAACCTGTCTCGATGA-1 NA F ## 10X_P7_4_AAACGGGAGCGCTCCA-1 NA F ## 10X_P7_4_AAAGCAAAGTGAATTG-1 NA F ## ... ... ...
## 10X_P7_4_TTTGGTTCAGACTCGC-1 NA F ## 10X_P7_4_TTTGTCAAGTGGAGAA-1 NA F ## 10X_P7_4_TTTGTCACAAGCCATT-1 NA F ## 10X_P7_4_TTTGTCAGTCGATTGT-1 NA F ## 10X_P7_4_TTTGTCAGTGCGCTTG-1 NA F 14.7.3 Cell type search If you have a list of genes that you would like to check against your dataset, i.e. find the cell types that most likely represent your genes (highest expression), scfind allows you to do that by first creating a gene index and then very quickly searching the index. The input data for indexing is the raw count matrix of the SingleCellExperiment class. By default the cell_type1 column of the colData slot of the SingleCellExperiment object is used to define cell types; however, they can also be defined manually using the cell.type.label argument of buildCellTypeIndex. heart_index &lt;- buildCellTypeIndex( tm10x_heart, cell_type_column = &quot;cell_type1&quot; ) scfind adopts a two-step compression strategy which allows efficient compression of a large cell-by-gene matrix and fast retrieval of data by gene query. The authors estimated that one can achieve 2 orders of magnitude compression with this method. We can look at the p-values from scfind to discover the best matches. p_values &lt;- -log10(findCellType(heart_index, c(&quot;Fstl1&quot;))) barplot(p_values, ylab = &quot;-log10(pval)&quot;, las = 2) Exercise: Search for some of your favourite genes in the heart data! Exercise: Repeat this approach to build a cell type index for the thymus data. tm10x_thymus &lt;- readRDS(&quot;data/sce/Thymus_10X.rds&quot;) tm10x_thymus &lt;- tm10x_thymus[, !is.na(tm10x_thymus$cell_type1)] thymus_index &lt;- buildCellTypeIndex( tm10x_thymus, cell_type_column = &quot;cell_type1&quot; ) p_values &lt;- -log10(findCellType(thymus_index, c(&quot;Cd74&quot;))) barplot(p_values, ylab = &quot;-log10(pval)&quot;, las = 2) Exercise: Search for interesting genes in the thymus data! 14.7.4 Cell search If you are more interested in finding out in which cells all the genes from your gene list are expressed, you can build a cell index instead of a cell type index. The buildCellIndex function should be used for building the index and findCell for searching it: heart_gene_index &lt;- buildCellIndex(tm10x_heart) res &lt;- findCell(heart_gene_index, c(&quot;Fstl1&quot;, &quot;Ppia&quot;)) head(res$common_exprs_cells) ## cell_id cell_type ## 1 1 endothelial cell ## 2 2 fibroblast ## 3 7 fibroblast ## 4 9 endocardial cell ## 5 10 smooth muscle cell ## 6 13 smooth muscle cell barplot(-log10(res$p_values), ylab = &quot;-log10(pval)&quot;, las = 2) Cell search reports the p-values corresponding to cell types as well. Exercise: Search for some of your favourite genes in the heart data! Exercise: Repeat this approach to build a cell index for the thymus data. Advanced exercise: Try scfind on the Deng data and see if you can find interesting results for well-known developmental genes in that mouse embryo data. 14.7.5 Explore Exercise: Play around with the scfind website, which allows for searching of several large “atlas”-type datasets. This could be a useful way to check the utility or validity of cell-type marker genes before use in your own analysis. For example, you might explore the following cardiac contractility genes in the Mouse Atlas or Tabula Muris datasets.
cardiac_contractility &lt;- c(&quot;Ace2&quot;,&quot;Fkbp1b&quot;,&quot;Gh&quot;,&quot;Cacna1c&quot;,&quot;Cd59b&quot;,&quot;Ppp1r1a&quot;,&quot;Tnnt2&quot;,&quot;Nos1&quot;,&quot;Agtr1a&quot;,&quot;Camk2g&quot;,&quot;Grk2&quot;,&quot;Ins2&quot;,&quot;Dnah8&quot;,&quot;Igf1&quot;,&quot;Nos3&quot;,&quot;Nppa&quot;,&quot;Nppb&quot;,&quot;Il6&quot;,&quot;Myh6&quot;,&quot;Ren2&quot;,&quot;Tnni3&quot;,&quot;Apln&quot;,&quot;Kcnmb1&quot;,&quot;Pik3cg&quot;,&quot;Prkca&quot;,&quot;Aplnr&quot;,&quot;Slc8a1&quot;,&quot;Ace&quot;,&quot;Akt1&quot;,&quot;Edn1&quot;,&quot;Kcnmb2&quot;,&quot;Nos2&quot;,&quot;Tnf&quot;,&quot;Myh14&quot;,&quot;Adrb2&quot;,&quot;Agt&quot;,&quot;Adrb1&quot;,&quot;Atp2a2&quot;,&quot;Ryr2&quot;,&quot;Pln&quot;) 14.7.5.1 In-silico gating On the scfind website, you can perform “in-silico gating” using logical operators on marker genes to identify cell type subsets, similar to how cell sorting works. To do so, you can add logical operators including “-” and &quot;*&quot; for “no” and “intermediate” expression, respectively, in front of the gene name. Exercise: Try out some in silico gating on one or more datasets available at the scfind website. For example, can you gate to find regulatory T cells and effector memory T cells? 14.7.6 sessionInfo() sessionInfo() ## R version 3.6.0 (2019-04-26) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 18.04.3 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 ## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1 ## ## locale: ## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8 ## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8 ## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] parallel stats4 stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] plotly_4.9.0 scfind_1.6.0 ## [3] scmap_1.6.0 scater_1.12.2 ## [5] ggplot2_3.2.1 SingleCellExperiment_1.6.0 ## [7] SummarizedExperiment_1.14.1 DelayedArray_0.10.0 ## [9] BiocParallel_1.18.1 matrixStats_0.55.0 ## [11] Biobase_2.44.0 GenomicRanges_1.36.1 ## [13] GenomeInfoDb_1.20.0 IRanges_2.18.3 ## [15] S4Vectors_0.22.1 BiocGenerics_0.30.0 ## [17] googleVis_0.6.4 ## ## loaded via a namespace (and not attached): ## [1] httr_1.4.1 viridis_0.5.1 ## [3] tidyr_1.0.0 BiocSingular_1.0.0 ## [5] jsonlite_1.6 viridisLite_0.3.0 ## [7] DelayedMatrixStats_1.6.1 assertthat_0.2.1 ## [9] highr_0.8 GenomeInfoDbData_1.2.1 ## [11] vipor_0.4.5 yaml_2.2.0 ## [13] backports_1.1.4 pillar_1.4.2 ## [15] lattice_0.20-38 glue_1.3.1 ## [17] digest_0.6.21 XVector_0.24.0 ## [19] randomForest_4.6-14 colorspace_1.4-1 ## [21] htmltools_0.3.6 Matrix_1.2-17 ## [23] plyr_1.8.4 pkgconfig_2.0.3 ## [25] bookdown_0.13 zlibbioc_1.30.0 ## [27] purrr_0.3.2 scales_1.0.0 ## [29] tibble_2.1.3 proxy_0.4-23 ## [31] withr_2.1.2 lazyeval_0.2.2 ## [33] magrittr_1.5 crayon_1.3.4 ## [35] evaluate_0.14 class_7.3-15 ## [37] beeswarm_0.2.3 tools_3.6.0 ## [39] hash_2.2.6.1 data.table_1.12.2 ## [41] lifecycle_0.1.0 stringr_1.4.0 ## [43] munsell_0.5.0 irlba_2.3.3 ## [45] compiler_3.6.0 e1071_1.7-2 ## [47] rsvd_1.0.2 rlang_0.4.0 ## [49] grid_3.6.0 RCurl_1.95-4.12 ## [51] BiocNeighbors_1.2.0 htmlwidgets_1.3 ## [53] bitops_1.0-6 labeling_0.3 ## [55] rmarkdown_1.15 gtable_0.3.0 ## [57] codetools_0.2-16 reshape2_1.4.3 ## [59] R6_2.4.0 gridExtra_2.3 ## [61] knitr_1.25 dplyr_0.8.3 ## [63] zeallot_0.1.0
bit_1.1-14 ## [65] stringi_1.4.3 ggbeeswarm_0.6.0 ## [67] Rcpp_1.0.2 vctrs_0.2.0 ## [69] tidyselect_0.2.5 xfun_0.9 References "],
diff --git a/public/trajectory-inference.html b/public/trajectory-inference.html
index d40203361c8e5ef1678467877c17d8c020b5e7d8..30a41c54fb98509c9f0d171f7685cde8b4c303b9 100644
--- a/public/trajectory-inference.html
+++ b/public/trajectory-inference.html
@@ -757,7 +757,6 @@ nonlinear summary of the data. They are nonparametric, and their shape is sugges
 <a class="sourceLine" id="cb514-27" data-line-number="27"><span class="kw">heatmap</span>(heatdata, <span class="dt">Colv =</span> <span class="ot">NA</span>,</a>
 <a class="sourceLine" id="cb514-28" data-line-number="28">        <span class="dt">ColSideColors =</span> my_color[heatclus],<span class="dt">cexRow =</span> <span class="dv">1</span>,<span class="dt">cexCol =</span> <span class="dv">1</span>)</a></code></pre></div>
 <p><img src="pseudotime_files/figure-html/gam_tm_deg-1.png" width="90%" style="display: block; margin: auto;" /></p>
-<p>We will regress each gene on the pseudotime variable we have generated, using a general additive model (GAM). This allows us to detect non-linear patterns in gene expression.</p>
 </div>
 </div>
 <div id="monocle" class="section level2">
@@ -772,11 +771,10 @@ defined as a separate cell state.</p>
 <div id="monocle-2" class="section level3">
 <h3><span class="header-section-number">11.4.1</span> Monocle 2</h3>
 <p><code>Monocle 2</code> <span class="citation">(Qiu et al. <a href="#ref-Qiu2017-xq">2017</a>)</span> uses a different approach, with dimensionality reduction and ordering performed by reverse
-graph embedding (RGE), allowing it to detect branching events in an unsupervised manner. RGE, a machine-learning strategy, learns a
-‘principal graph’ to describe the single-cell dataset. RGE also learns the mapping function of data points on the trajectory back to the original high dimentional space simutaneously. In doing so, it aims to position the latent points in the lower dimension space (along the trajectory) while also ensuring their corresponding
+graph embedding (RGE), allowing it to detect branching events in an unsupervised manner. RGE, a machine-learning strategy, learns a ‘principal graph’ to describe the single-cell dataset. RGE also learns the mapping function of data points on the trajectory back to the original high-dimensional space simultaneously. In doing so, it aims to position the latent points in the lower-dimensional space (along the trajectory) while also ensuring their corresponding
 positions in the input dimension are ‘neighbors’.</p>
-<p>There are different ways of implementing the RGE framework, <code>Monocle 2</code> uses <code>DDRTree</code>(Discriminative dimensionality reduction via learning a tree) by default.
-DDRTree learns latent points and the projection of latent points to the points in original input space, which is equivalent to “dimension reduction”. In addition, it simutanously learns ‘principal graph’ for K-means soft clustered cetroids for the latent points. Principal graph is the spanning tree of those centroids.</p>
+<p>There are different ways of implementing the RGE framework; <code>Monocle 2</code> uses <code>DDRTree</code> (Discriminative dimensionality reduction via learning a tree) by default.</p>
+<p><code>DDRTree</code> learns latent points and the projection of the latent points to the points in the original input space, which is equivalent to “dimension reduction”. In addition, it simultaneously learns a ‘principal graph’ over the K-means soft-clustered centroids of the latent points. The principal graph is the spanning tree of those centroids.</p>
 <p>DDRTree returns a principal tree of the centroids of cell clusters in low dimension; pseudotime is derived for individual cells by calculating the geodesic distance of their projections onto the tree from the root (user-defined or arbitrarily assigned).</p>
 <p><strong>Note</strong> Informally, a principal graph is like a principal curve which passes through the ‘middle’ of a data set but is
 allowed to have branches.</p>
@@ -834,10 +832,9 @@ allowed to have branches.</p>
 </div>
 <div id="monocle-3" class="section level3">
 <h3><span class="header-section-number">11.4.2</span> Monocle 3</h3>
-<p><a href="https://www.nature.com/articles/s41586-019-0969-x"><code>Monocle3</code></a><span class="citation">(Cao et al. <a href="#ref-Cao2019-cj">2019</a>)</span> is the updated single-cell analysis toolkit for analysing large datasets. <a href="https://cole-trapnell-lab.github.io/monocle3/docs/starting/">Monocle 3</a> is designed for use with absolute transcript counts (e.g. from UMI experiments). It first does dimension reduction with UMAP and then clusters the cells with Louvian/Leiden algorithms and merge adjacent groups into supergroup, and finaly resovles the trajectories individual cells can take during development, identifies
-the locations of branches and convergences within each supergroup.</p>
+<p><a href="https://www.nature.com/articles/s41586-019-0969-x"><code>Monocle3</code></a><span class="citation">(Cao et al. <a href="#ref-Cao2019-cj">2019</a>)</span> is the updated single-cell analysis toolkit for analysing large datasets. <a href="https://cole-trapnell-lab.github.io/monocle3/docs/starting/">Monocle 3</a> is designed for use with absolute transcript counts (e.g. from UMI experiments). It first does dimension reduction with UMAP, then it clusters the cells with Louvain/Leiden algorithms and merge adjacent groups into supergroup, and finaly resovles the trajectories individual cells can take during development within each supergroup.</p>
 <p>In short, Monocle3 uses <code>UMAP</code> to construct an initial trajectory inference and refines it by learning a principal graph.</p>
-<p>It builds KNN graph in the UMAP dimensions and runs Louvain/Leiden algorithms om the KNN graph to derive communities; edges are drawn to connect communities that have more links (Partitioned Approximate Graph Abstraction (PAGA) graph). Each component of the PAGA grah is passed to the next step which is learning principal graph based on the SimplePPT algorithm. The pseudotime is calculated for individual cells by projecting the cells to their nearest point on the principal graph edge and measure geodesic distance along of principal points to the closest of their root nodes.</p>
+<p>It builds a KNN graph in the UMAP dimensions and runs Louvain/Leiden algorithms on the KNN graph to derive communities; edges are drawn to connect communities that have more links (Partitioned Approximate Graph Abstraction (PAGA) graph). Each component of the PAGA graph is passed to the next step, which learns a principal graph based on the SimplePPT algorithm. The pseudotime is calculated for individual cells by projecting the cells to their nearest point on the principal graph edge and measuring the geodesic distance along the principal points to the closest of their root nodes.</p>
 <div class="sourceCode" id="cb519"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb519-1" data-line-number="1"><span class="kw">library</span>(monocle3)</a></code></pre></div>
 <pre><code>## 
 ## Attaching package: &#39;monocle3&#39;</code></pre>
@@ -878,8 +875,9 @@ the locations of branches and convergences within each supergroup.</p>
 <a class="sourceLine" id="cb526-14" data-line-number="14"><span class="kw">plot_cells</span>(cds, <span class="dt">color_cells_by=</span><span class="st">&quot;cell_type2&quot;</span>, <span class="dt">graph_label_size =</span> <span class="dv">4</span>, <span class="dt">cell_size =</span> <span class="dv">2</span>,</a>
 <a class="sourceLine" id="cb526-15" data-line-number="15">           <span class="dt">group_label_size =</span> <span class="dv">6</span>)<span class="op">+</span><span class="st"> </span><span class="kw">scale_color_manual</span>(<span class="dt">values =</span> my_color) </a></code></pre></div>
 <p><img src="pseudotime_files/figure-html/run_monocle3-2.png" width="90%" style="display: block; margin: auto;" /></p>
-<div class="sourceCode" id="cb527"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb527-1" data-line-number="1"><span class="kw">plot_cells</span>(cds,  <span class="dt">graph_label_size =</span> <span class="dv">6</span>, <span class="dt">cell_size =</span> <span class="dv">1</span>, <span class="dt">color_cells_by=</span><span class="st">&quot;pseudotime&quot;</span>,</a>
-<a class="sourceLine" id="cb527-2" data-line-number="2">           <span class="dt">group_label_size =</span> <span class="dv">6</span>)</a></code></pre></div>
+<div class="sourceCode" id="cb527"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb527-1" data-line-number="1"><span class="kw">plot_cells</span>(cds,  <span class="dt">graph_label_size =</span> <span class="dv">6</span>, <span class="dt">cell_size =</span> <span class="dv">1</span>, </a>
+<a class="sourceLine" id="cb527-2" data-line-number="2">           <span class="dt">color_cells_by=</span><span class="st">&quot;pseudotime&quot;</span>,</a>
+<a class="sourceLine" id="cb527-3" data-line-number="3">           <span class="dt">group_label_size =</span> <span class="dv">6</span>)</a></code></pre></div>
 <pre><code>## Cells aren&#39;t colored in a way that allows them to be grouped.</code></pre>
 <p><img src="pseudotime_files/figure-html/run_monocle3-3.png" width="90%" style="display: block; margin: auto;" /></p>
 <div class="sourceCode" id="cb529"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb529-1" data-line-number="1">pdata_cds &lt;-<span class="st"> </span><span class="kw">pData</span>(cds)</a>