Introduction

In this case study, we will analyse the 10x single-cell dataset from an experiment on mouse retinal cells from the study O’Koren et al. FACS-sorted cells (Cxc3cr1 YEP+ ) from pooled neuroretinas of two groups: Control and LD (light damage) mice are sequenced to gain insight into the transcriptional changes of subretinal microglia in photoreceptor degeneration.

Download gene expression dataset

We first download the dataset from GEO via links below:

LD group: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM3612832

Control group: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM3612831

Example:

mkdir LD_cr_counts
cd LD_cr_counts
wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3612nnn/GSM3612832/suppl/GSM3612832_LD_genes.tsv.gz
wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3612nnn/GSM3612832/suppl/GSM3612832_LD_barcodes.tsv.gz
wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3612nnn/GSM3612832/suppl/GSM3612831_LD_matrix.mtx.gz

cd ..
mkdir Ctrl_cr_counts

wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3612nnn/GSM3612831/suppl/GSM3612831_ctrl_genes.tsv.gz
wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3612nnn/GSM3612831/suppl/GSM3612831_ctrl_barcodes.tsv.gz
wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3612nnn/GSM3612831/suppl/GSM3612831_ctrl_matrix.mtx.gz

Load count matrix into R

library(DropletUtils)
library(scater)
library(Matrix)

### Read in control cells 

cellbarcodes <- read.table("../case_study_data/retinal/GEO_downloads/Ctrl/GSM3612831_ctrl_barcodes.tsv.gz",stringsAsFactors = FALSE)
genenames <- read.table("../case_study_data/retinal/GEO_downloads/Ctrl/GSM3612831_ctrl_genes.tsv.gz",stringsAsFactors = FALSE)
molecules <- readMM("../case_study_data/retinal/GEO_downloads/Ctrl/GSM3612831_ctrl_matrix.mtx.gz")


head(cellbarcodes)
##                   V1
## 1 AAACCTGAGACGCAAC-1
## 2 AAACCTGCAGACGCCT-1
## 3 AAACCTGCAGTAAGAT-1
## 4 AAACCTGGTAGAGTGC-1
## 5 AAACCTGTCGATGAGG-1
## 6 AAACGGGAGATCACGG-1
rownames(molecules) <- genenames[,1]

## add sample ID prefix to the cell barcodes with  
colnames(molecules) <- paste0("ctrl_", cellbarcodes[,1])


sce_ctrl <- SingleCellExperiment(
    assays = list(counts = molecules),
    colData = data.frame("Cell" = colnames(molecules), Sample = "Ctrl"),
    rowData = data.frame("ID" = genenames$V1, "Symbol" = genenames$V2,
                       stringsAsFactors = FALSE))

sce_ctrl
## class: SingleCellExperiment 
## dim: 27998 4003 
## metadata(0):
## assays(1): counts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
##   ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(4003): ctrl_AAACCTGAGACGCAAC-1 ctrl_AAACCTGCAGACGCCT-1
##   ... ctrl_TTTGTCAGTATAGGGC-1 ctrl_TTTGTCATCGCGTAGC-1
## colData names(2): Cell Sample
## reducedDimNames(0):
## spikeNames(0):
## Read in LD cells
cellbarcodes <- read.table("../case_study_data/retinal/GEO_downloads/LD/GSM3612832_LD_barcodes.tsv.gz",stringsAsFactors = FALSE)
genenames <- read.table("../case_study_data/retinal/GEO_downloads/LD/GSM3612832_LD_genes.tsv.gz",stringsAsFactors = FALSE)
molecules <- readMM("../case_study_data/retinal/GEO_downloads/LD/GSM3612832_LD_matrix.mtx.gz")


head(cellbarcodes)
##                   V1
## 1 AAACCTGAGAACAACT-1
## 2 AAACCTGAGATTACCC-1
## 3 AAACCTGAGCTTTGGT-1
## 4 AAACCTGAGTCCCACG-1
## 5 AAACCTGCACGGACAA-1
## 6 AAACCTGCAGTAAGCG-1
rownames(molecules) <- genenames[,1]
colnames(molecules) <- paste0("ld_", cellbarcodes[,1])

sce_ld <- SingleCellExperiment(
    assays = list(counts = molecules),
    colData = data.frame("Cell" = colnames(molecules), Sample = "LD"),
    rowData = data.frame("ID" = genenames$V1, "Symbol" = genenames$V2,
                       stringsAsFactors = FALSE))
sce_ld
## class: SingleCellExperiment 
## dim: 27998 6821 
## metadata(0):
## assays(1): counts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
##   ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(6821): ld_AAACCTGAGAACAACT-1 ld_AAACCTGAGATTACCC-1 ...
##   ld_TTTGTCATCTAACTCT-1 ld_TTTGTCATCTAACTGG-1
## colData names(2): Cell Sample
## reducedDimNames(0):
## spikeNames(0):
## combine cells and samples
sce_cr <-  cbind(sce_ctrl, sce_ld)

We have loaded 6821 control cells and 4003 LD cells as descriped in the papar.(O’Koren et al)

Quality control on cells and genes

We next filter out low-quality cells and genes that are not ‘detectable’.

Calculate QC metrics

threads <- 20
seed <- 100
param <- MulticoreParam(workers = threads)

rownames(sce_cr)  <-  uniquifyFeatureNames(rowData(sce_cr)$ID, 
                                        rowData(sce_cr)$Symbol)

## Identify chromosome location
## from the annotation package

library(EnsDb.Mmusculus.v79)
location  <-  mapIds(EnsDb.Mmusculus.v79, keys = rowData(sce_cr)$ID,
                 column = "SEQNAME", keytype = "GENEID")
head(location)
## ENSMUSG00000051951 ENSMUSG00000089699 ENSMUSG00000102343 
##                "1"                "1"                "1" 
## ENSMUSG00000025900 ENSMUSG00000109048 ENSMUSG00000025902 
##                "1"                 NA                "1"
rowData(sce_cr)$CHR = location 

summary(location == "MT")
##    Mode   FALSE    TRUE    NA's 
## logical   26742      13    1243
#  rbg <- grepl("^Rp", rownames(sce_cr))

# sce_cr = calculateQCMetrics(sce_cr, feature_controls = 
#                          list(Mito = which(rowData(sce_cr)$CHR == "MT"),
#                               RBS = rbg))

sce_cr  <-  calculateQCMetrics(sce_cr, feature_controls = 
                           list(Mito = which(rowData(sce_cr)$CHR == "MT")))

par(mfrow=c(1,3))
hist(sce_cr$log10_total_counts, breaks=20, col="grey80",
    xlab="Log-total UMI count")
hist(sce_cr$log10_total_features_by_counts, breaks=20, col="grey80",
    xlab="Log-total number of expressed features")
hist(sce_cr$pct_counts_Mito, breaks=20, col="grey80",
    xlab="Proportion of reads in mitochondrial genes")

par(mfrow=c(1,3))
hist(sce_cr$pct_counts_endogenous, breaks=20, col="grey80",
    xlab="pct_counts_endogenous")
hist(sce_cr$pct_counts_in_top_50_features, breaks=20, col="grey80",
    xlab="pct_counts_in_top_50_features")
hist(sce_cr$pct_counts_in_top_100_features, breaks=20, col="grey80",
    xlab="pct_counts_in_top_100_features")

# The average expression of each gene
par(mfrow=c(1,1))
ave = calcAverage(sce_cr)
rowData(sce_cr)$AveCount <- sce_cr

hist(log10(ave), col="grey80")

## The top highly expression genes by average expression across cells

plotHighestExprs(sce_cr, n = 30)

Filter cells

We identify outlier cells based on metrics like total_counts, total_features_by_counts and pct_counts_Mito

## 
## filter out cells that have extreme QC metrics
#

#print(ncores)

nmads  <- 3
## Load qc_sce object
table(sce_cr$Sample)
## 
## Ctrl   LD 
## 4003 6821
head(colnames(colData(sce_cr)),10)
##  [1] "Cell"                           "Sample"                        
##  [3] "is_cell_control"                "total_features_by_counts"      
##  [5] "log10_total_features_by_counts" "total_counts"                  
##  [7] "log10_total_counts"             "pct_counts_in_top_50_features" 
##  [9] "pct_counts_in_top_100_features" "pct_counts_in_top_200_features"
dim(sce_cr)
## [1] 27998 10824
# The aim is to remove putative low-quality cells that have low library sizes, low numbers of expressed features, and high # mitochondrial proportions.

## Find the outlier cells by library size, mito_percent, feature count 

## We remove small outliers for the library size and the number of expressed features, and large outliers for the mito proportions. 

## identify outliers defined based on the media absolute deviation (MADs)
libsize.drop = isOutlier(sce_cr$total_counts, nmads=nmads, type="lower",
    log=TRUE)

feature.drop = isOutlier(sce_cr$total_features_by_counts, nmads=nmads, type="lower",
    log=TRUE)
mito.drop = isOutlier(sce_cr$pct_counts_Mito, nmads=nmads, type="higher")
#RBS.drop = isOutlier(sce_cr$pct_counts_RBS, nmads=nmads, type="higher")

## thresholds used for each QC metrics

attr(libsize.drop, "thresholds")
##    lower   higher 
## 1472.111      Inf
attr(feature.drop, "thresholds")
##    lower   higher 
## 833.6605      Inf
attr(mito.drop, "thresholds")
##    lower   higher 
##     -Inf 4.975133
#attr(RBS.drop, "thresholds")


## check how many cells will be filtered out by these criteria

keep = !(libsize.drop | feature.drop | mito.drop)

print(data.frame(ByLibSize=sum(libsize.drop), ByFeature=sum(feature.drop),
ByMito=sum(mito.drop), Remaining=sum(keep)))
##   ByLibSize ByFeature ByMito Remaining
## 1       123       185    377     10345
sce_cr$PassQC <-  keep


multiplot(
    plotColData(sce_cr, y="total_counts", x="Sample", colour_by="PassQC"),
    plotColData(sce_cr, y="total_features_by_counts", x="Sample",colour_by="PassQC"),
    plotColData(sce_cr, y="pct_counts_Mito", x="Sample",colour_by="PassQC"),
    cols=2)

# relations of QC metrics respect to each other ( in rough aggreement)
par(mfrow=c(1,2))
plot(sce_cr$total_features_by_counts, sce_cr$total_counts/1e6,
     xlab="Number of expressed genes",
     ylab="Library size (millions)")
plot(sce_cr$total_features_by_counts, sce_cr$pct_counts_Mito,
     xlab="Number of expressed genes",
     ylab="Mitochondrial proportion (%)")

## subset the dataset keeping the high-quality cells

sce_cr <- sce_cr[, keep]
table(sce_cr$Sample)
## 
## Ctrl   LD 
## 3706 6639

Doublets

We run a doublet detection algorithm scds (https://github.com/kostkalab/scds) here:

library(scds)
set.seed(100)
#- Annotate doublet using co-expression based doublet scoring:
sce_cr <- cxds(sce_cr)

#- Annotate doublet using binary classification based doublet scoring:
sce_cr <- bcds(sce_cr)

#- Combine both annotations into a hybrid annotation
sce_cr <- cxds_bcds_hybrid(sce_cr)

#- Doublet scores are now available via colData:
CD   <-  colData(sce_cr)
head(cbind(CD$cxds_score,CD$bcds_score, CD$hybrid_score))
##                               [,1]        [,2]       [,3]
## ctrl_AAACCTGAGACGCAAC-1 12865.0887 0.774427712 1.30928814
## ctrl_AAACCTGCAGACGCCT-1  8986.7212 0.024173055 0.39687540
## ctrl_AAACCTGGTAGAGTGC-1   304.1573 0.001523374 0.01394471
## ctrl_AAACCTGTCGATGAGG-1  9536.4905 0.091702789 0.48732718
## ctrl_AAACGGGAGATCACGG-1  2200.0600 0.002343185 0.09342774
## ctrl_AAACGGGAGCGATCCC-1  4409.5489 0.002034686 0.18479142
plotColData(
    sce_cr,
    x = "Sample",
    y = "total_features_by_counts",
    colour = "hybrid_score"
)

Filter out potential doublets which are cells that have higher hybrid_score.

db.drop  <-  isOutlier(sce_cr$hybrid_score, nmads=5, type="higher")

sum(db.drop)
## [1] 606
sce_cr <- sce_cr[,!db.drop]
table(sce_cr$Sample)
## 
## Ctrl   LD 
## 3569 6170
#saveRDS(sce_cr,file="sce_cr_filtered.rds")

Normalisation of gene expression

We next normalise the gene expression for each gene across all cells using sctransform.

#sce_cr <- readRDS(file = "sce_cr_filtered.rds")
set.seed(100)

umi <- assay(sce_cr,"counts")

colnames(umi) <- sce_cr$Cell
cell_attr  <-  colData(sce_cr)

rownames(cell_attr) <- cell_attr$Cell

# future::plan(strategy = 'multicore', workers = 8)
# options(future.globals.maxSize = 10 * 1024 ^ 3)

### Genes expressed in at least 5 cells will be kept
normalized_data <- sctransform::vst(umi = umi, min_cells = 5,
                                    cell_attr = cell_attr,
                                    latent_var = "log10_total_counts",
                                    show_progress = FALSE)

remain_genes  <-  intersect(rownames(umi),rownames(normalized_data$y))

sce_cr <- sce_cr[remain_genes,]
#colnames(sce_cr) <- sce_cr$Barcode_S

## the model we supplied in running sctransform:

normalized_data$model_str
## [1] "y ~ log10_total_counts"

We add the result to the SCE object as the ‘sctrans_norm’ assay.

dim(sce_cr)
## [1] 15045  9739
assay(sce_cr, "sctrans_norm") <- normalized_data$y[rownames(sce_cr),colnames(sce_cr)]


#sce_cr$log10_total_counts
##   Matrix of estimated model parameters per gene (theta and regression coefficients)

sctransform::plot_model_pars(normalized_data)

Normalised gene expression with stablised variance:

## Further, after transformation there is no relationship between gene mean and variance, as the next plot shows.
#c('Malat1', 'Rpl10', 'Cd74')
sctransform::plot_model(normalized_data, umi, c("Rpl10","Cd74","Malat1"), 
                        plot_residual = TRUE,cell_attr = cell_attr)

Plot the top highly variance genes:

gene_attr <- normalized_data$gene_attr
gene_attr$gene_name <- rownames(gene_attr)

ggplot(gene_attr, aes(log10(gmean), residual_variance)) + geom_point(alpha=0.3, shape=16) +
  geom_density_2d(size = 0.3)+geom_text(data = gene_attr[gene_attr$variance>100,],
                                        mapping = aes(x = log10(gmean),
                                                      y = residual_variance,
                                                      label = gene_name ))+
  theme_classic()

## check size factor?
## summary(sizeFactors(sce_cr))

PCA with hvgs

We next run PCA using highly variable genes selected based on the residual variance of the sctransform normalised gene expression matrix.

set.seed(100)

## Number of hvgs to use
num_hvg <- 3000

## how many PCs to retain after PCA
num_pc <- 10
## order genes by residual variance and select the top 3000 genes
hvgs <- head(round(normalized_data$gene_attr[order(-normalized_data$gene_attr$residual_variance), ], 2), num_hvg)

hvgs <- rownames(hvgs)

param <- BiocParallel::MulticoreParam(workers = 10)

sce_cr <-  scater::runPCA(sce_cr, ncomponents = num_pc, exprs_values = "sctrans_norm", 
                          feature_set = hvgs, BPPARAM = param)

#reducedDim(sce_cr, "PCA") = sce_pca$x
#head(new_sce_pcs)
# number of selected PCs

ncol(reducedDim(sce_cr, "PCA"))
## [1] 10
## scree plot of PCs
plot(attr(reducedDim(sce_cr,"PCA"),"percentVar")*100,ylab = "Percentage of Variance explained")

set.seed(100)
sce_cr <- runUMAP(sce_cr, use_dimred = "PCA")
plotUMAP(sce_cr)

#sce_cr <- runTSNE(sce_cr,use_dimred = "PCA")
#sce_cr <- readRDS("sce_retina.rds")

Clustering

We build SNN graph and then cluster with louvain algorithm (or walktrap).

library(pheatmap)
library(scater)
library(scran)
library(Polychrome)

set.seed(100) # for reproducibility
## generate random colors using function from Polychrome
snn.gr <- buildSNNGraph(sce_cr, use.dimred="PCA", k = 80, BPPARAM = param)
clusters <- igraph::cluster_walktrap(snn.gr)
table(clusters$membership)
## 
##    1    2    3    4    5    6    7    8    9   10   11 
##  757  408 1378 3378  757 1965   85  218  305  105  383
sce_cr$Cluster <- factor(clusters$membership)

table(sce_cr$Cluster)
## 
##    1    2    3    4    5    6    7    8    9   10   11 
##  757  408 1378 3378  757 1965   85  218  305  105  383

Generate color palette for our clusters :

set.seed(7654321) 
cluster_colors <- createPalette(length(table(sce_cr$Cluster)), 
                          c("#010101", "#ff0000"), 
                          M=1000)
names(cluster_colors) <- unique(as.character(sce_cr$Cluster))
swatch(cluster_colors)

Plot the UMAP dims with cluster-colored cells

########## Function for generating labels on reduced dims plot

get_centroid <- function(dims, cluster){
  ## x is a vector containing the first dim
  ## y is a vector containing the second dim
  ## cluster is a factor containing the cluster label
  pol = as.matrix(dims)
  clusters = unique(cluster)
  cets = sapply(clusters,function(cl){
    r = colMeans(dims[cluster %in% cl,])
    names(r) =c("x","y")
    r
  },USE.NAMES = TRUE)
  colnames(cets) = clusters
  return(cets)
}

#############

plt_df_umap <- data.frame(umap1 = reducedDim(sce_cr,"UMAP")[,1],
                          umap2 = reducedDim(sce_cr,"UMAP")[,2])

plt_df_umap <- cbind(plt_df_umap,data.frame(colData(sce_cr)))

label_centroid  <-  get_centroid(reducedDim(sce_cr,"UMAP"), cluster = sce_cr$Cluster)

text_df  <- data.frame(x = label_centroid["x",],
                     y = label_centroid["y",],
                     text = colnames(label_centroid))


ggplot(data = plt_df_umap)+geom_point(mapping = aes(x = umap1,
                                                    y = umap2,
                                                    colour = Cluster),
                                      size = 0.6)+
  scale_color_manual(values = cluster_colors)+theme_classic()+geom_text(data = text_df,
                                              mapping = aes(x = x,y =y,
                                                            label =text))+
  guides(colour = guide_legend(override.aes = list(size=6)))

Plot sample-colored cells in UMAP dims

ggplot(data = plt_df_umap)+geom_point(mapping = aes(x = umap1,
                                                    y = umap2,
                                                    colour = Sample),size = 0.6,
                                      alpha=0.7)+
  scale_color_manual(values = list(LD="red", Ctrl="grey"))+theme_classic()

A little extra:

library(clustree)
clusters <- igraph::cluster_louvain(snn.gr)
sce_cr$result_1 <- factor(clusters$membership)

clusters <- igraph::cluster_walktrap(snn.gr)
sce_cr$result_2 <- factor(clusters$membership)

clustree(data.frame(colData(sce_cr)), prefix = "result_")

ggplot(data = plt_df_umap)+geom_point(mapping = aes(x = umap1,
                                                    y = umap2,
                                                    colour = Cluster),
                                      size = 0.6,alpha = 0.5)+
  scale_color_manual(values = cluster_colors)+theme_classic()+geom_text(data = text_df,
                                              mapping = aes(x = x,y =y,
                                                            label =text))+
  guides(colour = guide_legend(override.aes = list(size=6)))+facet_wrap(.~Sample)

gplt <- colData(sce_cr)
ggplot(data = data.frame(gplt))+geom_bar(mapping = aes(x = Cluster, fill = Sample),
                                         position = "fill")+scale_fill_manual(values = list(LD="red", Ctrl="grey"))

ggplot(data = data.frame(gplt))+geom_bar(mapping = aes(x = Cluster, fill = Sample)
                                         )+scale_fill_manual(values = list(LD="red", Ctrl="grey"))

Marker genes for each cluster

We find the marker genes for each cluster by doing a pairwiseTTtest for each cluster vs each of the remaining clusters and combine the results into one list.

markers_cl <- findMarkers(sce_cr, cluster = sce_cr$Cluster,
                          direction = "up",pval.type = "any", assay.type = "sctrans_norm")
num_genes <- 8
chosen = sapply(c(1:length(markers_cl)), function(clusterID){
  marker.set = markers_cl[[clusterID]]
  chosen = rownames(head(marker.set, ))
  #data.frame('chosen' = as.character(chosen),
  #           'clusterID' = clusterID)
  chosen
})

plotHeatmap(sce_cr, features= unique(as.character(chosen)), 
            exprs_values="sctrans_norm", zlim=5, center=TRUE, symmetric=TRUE,
            cluster_cols=FALSE, colour_columns_by="Cluster", 
            columns=order(sce_cr$Cluster),show_colnames=FALSE, fontsize_row = 8)

We use Microglia markers: P2ry12, Siglech, Sparc to identify microglia clusters. Microglia cells with high expression of microglia markers and low expression of myeloid markers.

Look for - Clusters mainly from Control, mostly from LD - Replicating MG : MKI67 (Marker Of Proliferation Ki-67) - Microglia Clusters large/small - monocyte-derived macrophages (mo-MFs),perivascular (pv) macrophages

Clusters of mo-MFs and perivascular (pv) macrophages were enriched for myelomonocytic genes, including Ccr2, Mrc1, Lyz2, and Cd163

plotExpression(sce_cr, x = "Cluster", features = c("P2ry12", "Siglech", "Sparc"),
               exprs_values = "sctrans_norm")

multiplot(plotExpression(sce_cr, x = "Cluster", features = c( "H2-Aa"),
               exprs_values = "sctrans_norm"),
          plotExpression(sce_cr, x = "Cluster", features = c( "Tmem119"),
               exprs_values = "sctrans_norm"),
          plotExpression(sce_cr, x = "Cluster", features = c( "Lgals3"),
               exprs_values = "sctrans_norm"))

multiplot(plotExpression(sce_cr, x = "Cluster", features = c( "Lpl"),
               exprs_values = "sctrans_norm"),
          plotExpression(sce_cr, x = "Cluster", features = c( "Spp1"),
               exprs_values = "sctrans_norm"))

plotExpression(sce_cr, x = "Cluster", features = c("Ccr2", "Mrc1", "Lyz2","Mki67"),
               exprs_values = "sctrans_norm")

plotHeatmap(sce_cr, features= c("P2ry12", "Siglech", "Sparc","Lag3","Ccr2", "Cd68","Mrc1", "Lyz2","Mki67"), 
            exprs_values="counts", zlim=5, center=TRUE, symmetric=TRUE,
            cluster_cols=FALSE, colour_columns_by="Cluster", colorScale = cluster_colors,
            columns=order(sce_cr$Cluster),show_colnames=FALSE, fontsize_row = 8)

#saveRDS(normalized_data$y, file = "norm_y.rds")
#saveRDS(sce_cr, file = "sce_retina.rds")

Look for srMG

We next look for the cluster corresponding to srMG (subretinal microglia) in LD. Prior knowledge assumed as described in the original paper:

1, srMG populations are fewer relative to other populations in LD, so we focus on small clusters 2, down-regulated homeostatic genes, including Tmem119, P2ry12, and Siglech,a phenotype reported in other CNS diseases

Find which cluster is more likely to be corresponding to cluster sMG3 (srMG) in the original paper.

subsce <- sce_cr[,sce_cr$Cluster %in% c(7,11)]
subsce$Cluster <- as.character(subsce$Cluster)
markers_srmg <- findMarkers(subsce, clusters = subsce$Cluster, 
                         assay.type = "sctrans_norm")
markers_srmg$`7`
## DataFrame with 15045 rows and 4 columns
##               Top              p.value                  FDR
##         <integer>            <numeric>            <numeric>
## Tuba1b          1 3.79378381681197e-37 5.70774775239358e-33
## H2afz           2 4.05197086962926e-35 3.04809508667863e-31
## Rgs10           3 1.13466482534566e-33 5.69034409910847e-30
## Fcgr3           4 5.23166805044113e-32 1.96776114547217e-28
## Itm2b           5 1.25564202818163e-30 3.77822686279851e-27
## ...           ...                  ...                  ...
## Tmem67      15041    0.999208402264314    0.999474131511642
## Dpm1        15042    0.999375730377185    0.999517203254054
## Ptcra       15043    0.999384332904669    0.999517203254054
## Ahr         15044    0.999615586688951    0.999682032819414
## Gm12728     15045    0.999750291570248    0.999750291570248
##                      logFC.11
##                     <numeric>
## Tuba1b       6.19146748938722
## H2afz        6.98254668196525
## Rgs10      -0.916348533582088
## Fcgr3       -1.19648062513746
## Itm2b       -1.61019794421386
## ...                       ...
## Tmem67    -0.0001014971960209
## Dpm1      9.0454421426682e-05
## Ptcra    7.72762192773092e-05
## Ahr     -2.74498672339779e-05
## Gm12728 -2.68342016308196e-05
r_plt <- data.frame(markers_srmg$`7`)
r_plt$genes <- rownames(r_plt)

ggplot(data = r_plt)+geom_point(mapping = aes(x = logFC.11, y = -log10(FDR)))+
  geom_text(data = r_plt[abs(r_plt$logFC.11)>1.5 & -log10(r_plt$FDR) > 10 ,],
            mapping = aes(
              x = logFC.11, y = -log10(FDR), label = genes))+
  theme_classic()+xlim(-3,3)
## Warning: Removed 70 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_text).

Refer to Figure 4E in the original paper, the down/up-regulated genes in cluster 9 matching sMG3 well.

rownames(r_plt[r_plt$logFC.11> 2 & -log10(r_plt$FDR) > 5 ,])
##   [1] "Tuba1b"        "H2afz"         "Tubb5"         "Stmn1"        
##   [5] "Ptma"          "Ran"           "Hmgn1"         "Hmgb2"        
##   [9] "Txn1"          "Tagln2"        "Anp32b"        "Cks1b"        
##  [13] "Smc2"          "Calm1"         "2810417H13Rik" "Ranbp1"       
##  [17] "Smc4"          "Dek"           "Cox5a"         "Tk1"          
##  [21] "Gapdh"         "Erh"           "Hmgn2"         "Snrpd1"       
##  [25] "Cenpw"         "Prdx1"         "Birc5"         "Gmnn"         
##  [29] "2700094K13Rik" "Pcna"          "H2afv"         "Tmsb10"       
##  [33] "Rrm1"          "Dut"           "Asf1b"         "Tmsb4x"       
##  [37] "Cdca8"         "Mki67"         "Dnajc9"        "Mcm6"         
##  [41] "Tyms"          "Ms4a6c"        "Racgap1"       "Gins2"        
##  [45] "Mcm5"          "Pgam1"         "Mcm3"          "Lgals1"       
##  [49] "Lig1"          "H2afx"         "Cenpm"         "Spc24"        
##  [53] "Ezh2"          "Nasp"          "Ldha"          "Rfc5"         
##  [57] "Tcf19"         "Syce2"         "Cdk1"          "Rrm2"         
##  [61] "Ccna2"         "Tpx2"          "Fen1"          "Tubb4b"       
##  [65] "Apitd1"        "Tacc3"         "Rad51ap1"      "Uhrf1"        
##  [69] "Rad51"         "Chaf1b"        "Hells"         "Vim"          
##  [73] "Tipin"         "Top2a"         "Mcm2"          "Mcm7"         
##  [77] "Mif"           "Cbx5"          "Ung"           "Cdca3"        
##  [81] "Csrp1"         "Slc43a3"       "Prim1"         "Dhfr"         
##  [85] "Pbk"           "Knstrn"        "Ccnb1"         "Cdkn3"        
##  [89] "Cenpa"         "Cks2"          "Ccnb2"         "Tgfbi"        
##  [93] "Fignl1"        "Hmmr"          "Clspn"         "Crip1"        
##  [97] "Cenpe"         "Pkmyt1"        "Hmgb3"         "Atad2"        
## [101] "Dtl"           "Ccne1"         "Chaf1a"        "Cdc45"        
## [105] "Cep55"
rownames(r_plt[r_plt$logFC.11<(-2) & -log10(r_plt$FDR) > 5,])[1:10]
##  [1] "Ctsd" "Ctsb" "Cst7" NA     NA     NA     NA     NA     NA     NA

Practise Run this DE analysis for other small microglia cluster from LD group.

Monocle 2 trajectory inference

We next run monocle2 as described in the paper, as they stated that the srMG cluster is expected in the later development stages because it is particularly distinct from the steady-state (Cluster 1/2 from control) (As shown in Figure 5A). And

library(monocle)
library(dplyr)

lyz_cluster <- assay(sce_cr,"sctrans_norm")["Lyz2",]
lyz_cluster <- reshape2::melt(lyz_cluster)
lyz_cluster$cell <- rownames(lyz_cluster)
lyz_cluster$Cluster <- sce_cr$Cluster

lyz_cluster <- lyz_cluster %>% group_by(Cluster) %>% summarize(mean_lyz2 = mean(value))
lyz_cluster <- lyz_cluster$Cluster[lyz_cluster$mean_lyz2 > quantile(lyz_cluster$mean_lyz2,.75 )]

rep_MG <- assay(sce_cr,"sctrans_norm")["Mki67",]
rep_MG <- reshape2::melt(rep_MG)
rep_MG$cell <- rownames(rep_MG)
rep_MG$Cluster <- sce_cr$Cluster

rep_MG <- rep_MG %>% group_by(Cluster) %>% summarize(mean_mki67 = mean(value))
rep_MG <- rep_MG$Cluster[rep_MG$mean_mki67 > quantile(rep_MG$mean_mki67,.85 )]

d <- sce_cr[which(rownames(sce_cr) %in% hvgs[1:1000]), which(! sce_cr$Cluster %in% c(rep_MG,lyz_cluster))]
d <- d[!duplicated(rownames(d)), ]

colnames(d) <- 1:ncol(d)
geneNames <- rownames(d)
rownames(d) <- 1:nrow(d)
pd <- data.frame(colData(d) )
pd <- new("AnnotatedDataFrame", data=pd)
fd <- data.frame(gene_short_name = geneNames)
fd <- new("AnnotatedDataFrame", data=fd)

dCellData <- newCellDataSet(counts(d), phenoData = pd, featureData = fd)
#
dCellData <- setOrderingFilter(dCellData, which(geneNames %in% hvgs[1:1000]))
dCellData <- estimateSizeFactors(dCellData)
dCellDataSet <- reduceDimension(dCellData,reduction_method = "DDRTree", pseudo_expr = 1)
dCellDataSet <- orderCells(dCellDataSet, reverse = FALSE)
plot_cell_trajectory(dCellDataSet,color_by = "Pseudotime")

plot_cell_trajectory(dCellDataSet,color_by = "Sample")+scale_color_manual(values = list(Ctrl =  "grey",
                                                                                        LD = "red"))+facet_wrap(.~Cluster)

plot_cell_trajectory(dCellDataSet,color_by = "Cluster")+
  scale_color_manual(values = cluster_colors)+facet_wrap(.~Sample)+
  guides(colour = guide_legend(override.aes = list(size=6)))

plot_cell_trajectory(dCellDataSet,color_by = "State")+facet_wrap(.~Sample)

d <- sce_cr[which(rownames(sce_cr) %in% hvgs[1:1000]), which(sce_cr$Cluster %in% c(11,8,7,13,14))]
d <- d[!duplicated(rownames(d)), ]

colnames(d) <- 1:ncol(d)
geneNames <- rownames(d)
rownames(d) <- 1:nrow(d)
pd <- data.frame(colData(d) )
pd <- new("AnnotatedDataFrame", data=pd)
fd <- data.frame(gene_short_name = geneNames)
fd <- new("AnnotatedDataFrame", data=fd)

dCellData <- newCellDataSet(counts(d), phenoData = pd, featureData = fd)
#
dCellData <- setOrderingFilter(dCellData, which(geneNames %in% hvgs[1:1000]))
dCellData <- estimateSizeFactors(dCellData)
dCellDataSet <- reduceDimension(dCellData,reduction_method = "DDRTree", pseudo_expr = 1)
dCellDataSet <- orderCells(dCellDataSet, reverse = FALSE)

#dCellDataSet <- orderCells(dCellDataSet, reverse = FALSE, root_state = 11)
plot_cell_trajectory(dCellDataSet,color_by = "Cluster")+scale_color_manual(values = cluster_colors)+
    guides(colour = guide_legend(override.aes = list(size=6)))

plot_cell_trajectory(dCellDataSet,color_by = "Pseudotime")

plot_cell_trajectory(dCellDataSet,color_by = "State")

# Store the ordering
pseudotime_monocle2 <-
    data.frame(
        Cluster = phenoData(dCellDataSet)$Cluster,
        pseudotime = phenoData(dCellDataSet)$Pseudotime,
        State = phenoData(dCellDataSet)$State
    )
ggplot(data = pseudotime_monocle2)+geom_boxplot(mapping = aes(x = Cluster,
                                                              y = pseudotime))