In this case study, we will analyse the 10x single-cell dataset from an experiment on mouse retinal cells from the study O’Koren et al. FACS-sorted cells (Cxc3cr1 YEP+ ) from pooled neuroretinas of two groups: Control and LD (light damage) mice are sequenced to gain insight into the transcriptional changes of subretinal microglia in photoreceptor degeneration.
We first download the dataset from GEO via links below:
LD group: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM3612832
Control group: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM3612831
Example:
mkdir LD_cr_counts
cd LD_cr_counts
wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3612nnn/GSM3612832/suppl/GSM3612832_LD_genes.tsv.gz
wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3612nnn/GSM3612832/suppl/GSM3612832_LD_barcodes.tsv.gz
wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3612nnn/GSM3612832/suppl/GSM3612831_LD_matrix.mtx.gz
cd ..
mkdir Ctrl_cr_counts
wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3612nnn/GSM3612831/suppl/GSM3612831_ctrl_genes.tsv.gz
wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3612nnn/GSM3612831/suppl/GSM3612831_ctrl_barcodes.tsv.gz
wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3612nnn/GSM3612831/suppl/GSM3612831_ctrl_matrix.mtx.gz
library(DropletUtils)
library(scater)
library(Matrix)
### Read in control cells
cellbarcodes <- read.table("../case_study_data/retinal/GEO_downloads/Ctrl/GSM3612831_ctrl_barcodes.tsv.gz",stringsAsFactors = FALSE)
genenames <- read.table("../case_study_data/retinal/GEO_downloads/Ctrl/GSM3612831_ctrl_genes.tsv.gz",stringsAsFactors = FALSE)
molecules <- readMM("../case_study_data/retinal/GEO_downloads/Ctrl/GSM3612831_ctrl_matrix.mtx.gz")
head(cellbarcodes)
## V1
## 1 AAACCTGAGACGCAAC-1
## 2 AAACCTGCAGACGCCT-1
## 3 AAACCTGCAGTAAGAT-1
## 4 AAACCTGGTAGAGTGC-1
## 5 AAACCTGTCGATGAGG-1
## 6 AAACGGGAGATCACGG-1
rownames(molecules) <- genenames[,1]
## add sample ID prefix to the cell barcodes with
colnames(molecules) <- paste0("ctrl_", cellbarcodes[,1])
sce_ctrl <- SingleCellExperiment(
assays = list(counts = molecules),
colData = data.frame("Cell" = colnames(molecules), Sample = "Ctrl"),
rowData = data.frame("ID" = genenames$V1, "Symbol" = genenames$V2,
stringsAsFactors = FALSE))
sce_ctrl
## class: SingleCellExperiment
## dim: 27998 4003
## metadata(0):
## assays(1): counts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(4003): ctrl_AAACCTGAGACGCAAC-1 ctrl_AAACCTGCAGACGCCT-1
## ... ctrl_TTTGTCAGTATAGGGC-1 ctrl_TTTGTCATCGCGTAGC-1
## colData names(2): Cell Sample
## reducedDimNames(0):
## spikeNames(0):
## Read in LD cells
cellbarcodes <- read.table("../case_study_data/retinal/GEO_downloads/LD/GSM3612832_LD_barcodes.tsv.gz",stringsAsFactors = FALSE)
genenames <- read.table("../case_study_data/retinal/GEO_downloads/LD/GSM3612832_LD_genes.tsv.gz",stringsAsFactors = FALSE)
molecules <- readMM("../case_study_data/retinal/GEO_downloads/LD/GSM3612832_LD_matrix.mtx.gz")
head(cellbarcodes)
## V1
## 1 AAACCTGAGAACAACT-1
## 2 AAACCTGAGATTACCC-1
## 3 AAACCTGAGCTTTGGT-1
## 4 AAACCTGAGTCCCACG-1
## 5 AAACCTGCACGGACAA-1
## 6 AAACCTGCAGTAAGCG-1
rownames(molecules) <- genenames[,1]
colnames(molecules) <- paste0("ld_", cellbarcodes[,1])
sce_ld <- SingleCellExperiment(
assays = list(counts = molecules),
colData = data.frame("Cell" = colnames(molecules), Sample = "LD"),
rowData = data.frame("ID" = genenames$V1, "Symbol" = genenames$V2,
stringsAsFactors = FALSE))
sce_ld
## class: SingleCellExperiment
## dim: 27998 6821
## metadata(0):
## assays(1): counts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(6821): ld_AAACCTGAGAACAACT-1 ld_AAACCTGAGATTACCC-1 ...
## ld_TTTGTCATCTAACTCT-1 ld_TTTGTCATCTAACTGG-1
## colData names(2): Cell Sample
## reducedDimNames(0):
## spikeNames(0):
## combine cells and samples
sce_cr <- cbind(sce_ctrl, sce_ld)
We have loaded 6821 control cells and 4003 LD cells as descriped in the papar.(O’Koren et al)
We next filter out low-quality cells and genes that are not ‘detectable’.
threads <- 20
seed <- 100
param <- MulticoreParam(workers = threads)
rownames(sce_cr) <- uniquifyFeatureNames(rowData(sce_cr)$ID,
rowData(sce_cr)$Symbol)
## Identify chromosome location
## from the annotation package
library(EnsDb.Mmusculus.v79)
location <- mapIds(EnsDb.Mmusculus.v79, keys = rowData(sce_cr)$ID,
column = "SEQNAME", keytype = "GENEID")
head(location)
## ENSMUSG00000051951 ENSMUSG00000089699 ENSMUSG00000102343
## "1" "1" "1"
## ENSMUSG00000025900 ENSMUSG00000109048 ENSMUSG00000025902
## "1" NA "1"
rowData(sce_cr)$CHR = location
summary(location == "MT")
## Mode FALSE TRUE NA's
## logical 26742 13 1243
# rbg <- grepl("^Rp", rownames(sce_cr))
# sce_cr = calculateQCMetrics(sce_cr, feature_controls =
# list(Mito = which(rowData(sce_cr)$CHR == "MT"),
# RBS = rbg))
sce_cr <- calculateQCMetrics(sce_cr, feature_controls =
list(Mito = which(rowData(sce_cr)$CHR == "MT")))
par(mfrow=c(1,3))
hist(sce_cr$log10_total_counts, breaks=20, col="grey80",
xlab="Log-total UMI count")
hist(sce_cr$log10_total_features_by_counts, breaks=20, col="grey80",
xlab="Log-total number of expressed features")
hist(sce_cr$pct_counts_Mito, breaks=20, col="grey80",
xlab="Proportion of reads in mitochondrial genes")
par(mfrow=c(1,3))
hist(sce_cr$pct_counts_endogenous, breaks=20, col="grey80",
xlab="pct_counts_endogenous")
hist(sce_cr$pct_counts_in_top_50_features, breaks=20, col="grey80",
xlab="pct_counts_in_top_50_features")
hist(sce_cr$pct_counts_in_top_100_features, breaks=20, col="grey80",
xlab="pct_counts_in_top_100_features")
# The average expression of each gene
par(mfrow=c(1,1))
ave = calcAverage(sce_cr)
rowData(sce_cr)$AveCount <- sce_cr
hist(log10(ave), col="grey80")
## The top highly expression genes by average expression across cells
plotHighestExprs(sce_cr, n = 30)
We identify outlier cells based on metrics like total_counts
, total_features_by_counts
and pct_counts_Mito
##
## filter out cells that have extreme QC metrics
#
#print(ncores)
nmads <- 3
## Load qc_sce object
table(sce_cr$Sample)
##
## Ctrl LD
## 4003 6821
head(colnames(colData(sce_cr)),10)
## [1] "Cell" "Sample"
## [3] "is_cell_control" "total_features_by_counts"
## [5] "log10_total_features_by_counts" "total_counts"
## [7] "log10_total_counts" "pct_counts_in_top_50_features"
## [9] "pct_counts_in_top_100_features" "pct_counts_in_top_200_features"
dim(sce_cr)
## [1] 27998 10824
# The aim is to remove putative low-quality cells that have low library sizes, low numbers of expressed features, and high # mitochondrial proportions.
## Find the outlier cells by library size, mito_percent, feature count
## We remove small outliers for the library size and the number of expressed features, and large outliers for the mito proportions.
## identify outliers defined based on the media absolute deviation (MADs)
libsize.drop = isOutlier(sce_cr$total_counts, nmads=nmads, type="lower",
log=TRUE)
feature.drop = isOutlier(sce_cr$total_features_by_counts, nmads=nmads, type="lower",
log=TRUE)
mito.drop = isOutlier(sce_cr$pct_counts_Mito, nmads=nmads, type="higher")
#RBS.drop = isOutlier(sce_cr$pct_counts_RBS, nmads=nmads, type="higher")
## thresholds used for each QC metrics
attr(libsize.drop, "thresholds")
## lower higher
## 1472.111 Inf
attr(feature.drop, "thresholds")
## lower higher
## 833.6605 Inf
attr(mito.drop, "thresholds")
## lower higher
## -Inf 4.975133
#attr(RBS.drop, "thresholds")
## check how many cells will be filtered out by these criteria
keep = !(libsize.drop | feature.drop | mito.drop)
print(data.frame(ByLibSize=sum(libsize.drop), ByFeature=sum(feature.drop),
ByMito=sum(mito.drop), Remaining=sum(keep)))
## ByLibSize ByFeature ByMito Remaining
## 1 123 185 377 10345
sce_cr$PassQC <- keep
multiplot(
plotColData(sce_cr, y="total_counts", x="Sample", colour_by="PassQC"),
plotColData(sce_cr, y="total_features_by_counts", x="Sample",colour_by="PassQC"),
plotColData(sce_cr, y="pct_counts_Mito", x="Sample",colour_by="PassQC"),
cols=2)
# relations of QC metrics respect to each other ( in rough aggreement)
par(mfrow=c(1,2))
plot(sce_cr$total_features_by_counts, sce_cr$total_counts/1e6,
xlab="Number of expressed genes",
ylab="Library size (millions)")
plot(sce_cr$total_features_by_counts, sce_cr$pct_counts_Mito,
xlab="Number of expressed genes",
ylab="Mitochondrial proportion (%)")
## subset the dataset keeping the high-quality cells
sce_cr <- sce_cr[, keep]
table(sce_cr$Sample)
##
## Ctrl LD
## 3706 6639
We run a doublet
detection algorithm scds
(https://github.com/kostkalab/scds) here:
library(scds)
set.seed(100)
#- Annotate doublet using co-expression based doublet scoring:
sce_cr <- cxds(sce_cr)
#- Annotate doublet using binary classification based doublet scoring:
sce_cr <- bcds(sce_cr)
#- Combine both annotations into a hybrid annotation
sce_cr <- cxds_bcds_hybrid(sce_cr)
#- Doublet scores are now available via colData:
CD <- colData(sce_cr)
head(cbind(CD$cxds_score,CD$bcds_score, CD$hybrid_score))
## [,1] [,2] [,3]
## ctrl_AAACCTGAGACGCAAC-1 12865.0887 0.774427712 1.30928814
## ctrl_AAACCTGCAGACGCCT-1 8986.7212 0.024173055 0.39687540
## ctrl_AAACCTGGTAGAGTGC-1 304.1573 0.001523374 0.01394471
## ctrl_AAACCTGTCGATGAGG-1 9536.4905 0.091702789 0.48732718
## ctrl_AAACGGGAGATCACGG-1 2200.0600 0.002343185 0.09342774
## ctrl_AAACGGGAGCGATCCC-1 4409.5489 0.002034686 0.18479142
plotColData(
sce_cr,
x = "Sample",
y = "total_features_by_counts",
colour = "hybrid_score"
)
Filter out potential doublets which are cells that have higher hybrid_score
.
db.drop <- isOutlier(sce_cr$hybrid_score, nmads=5, type="higher")
sum(db.drop)
## [1] 606
sce_cr <- sce_cr[,!db.drop]
table(sce_cr$Sample)
##
## Ctrl LD
## 3569 6170
#saveRDS(sce_cr,file="sce_cr_filtered.rds")
We next normalise the gene expression for each gene across all cells using sctransform
.
#sce_cr <- readRDS(file = "sce_cr_filtered.rds")
set.seed(100)
umi <- assay(sce_cr,"counts")
colnames(umi) <- sce_cr$Cell
cell_attr <- colData(sce_cr)
rownames(cell_attr) <- cell_attr$Cell
# future::plan(strategy = 'multicore', workers = 8)
# options(future.globals.maxSize = 10 * 1024 ^ 3)
### Genes expressed in at least 5 cells will be kept
normalized_data <- sctransform::vst(umi = umi, min_cells = 5,
cell_attr = cell_attr,
latent_var = "log10_total_counts",
show_progress = FALSE)
remain_genes <- intersect(rownames(umi),rownames(normalized_data$y))
sce_cr <- sce_cr[remain_genes,]
#colnames(sce_cr) <- sce_cr$Barcode_S
## the model we supplied in running sctransform:
normalized_data$model_str
## [1] "y ~ log10_total_counts"
We add the result to the SCE
object as the ‘sctrans_norm’ assay.
dim(sce_cr)
## [1] 15045 9739
assay(sce_cr, "sctrans_norm") <- normalized_data$y[rownames(sce_cr),colnames(sce_cr)]
#sce_cr$log10_total_counts
## Matrix of estimated model parameters per gene (theta and regression coefficients)
sctransform::plot_model_pars(normalized_data)
Normalised gene expression with stablised variance:
## Further, after transformation there is no relationship between gene mean and variance, as the next plot shows.
#c('Malat1', 'Rpl10', 'Cd74')
sctransform::plot_model(normalized_data, umi, c("Rpl10","Cd74","Malat1"),
plot_residual = TRUE,cell_attr = cell_attr)
Plot the top highly variance genes:
gene_attr <- normalized_data$gene_attr
gene_attr$gene_name <- rownames(gene_attr)
ggplot(gene_attr, aes(log10(gmean), residual_variance)) + geom_point(alpha=0.3, shape=16) +
geom_density_2d(size = 0.3)+geom_text(data = gene_attr[gene_attr$variance>100,],
mapping = aes(x = log10(gmean),
y = residual_variance,
label = gene_name ))+
theme_classic()
## check size factor?
## summary(sizeFactors(sce_cr))
We next run PCA using highly variable genes selected based on the residual variance of the sctransform normalised gene expression matrix.
set.seed(100)
## Number of hvgs to use
num_hvg <- 3000
## how many PCs to retain after PCA
num_pc <- 10
## order genes by residual variance and select the top 3000 genes
hvgs <- head(round(normalized_data$gene_attr[order(-normalized_data$gene_attr$residual_variance), ], 2), num_hvg)
hvgs <- rownames(hvgs)
param <- BiocParallel::MulticoreParam(workers = 10)
sce_cr <- scater::runPCA(sce_cr, ncomponents = num_pc, exprs_values = "sctrans_norm",
feature_set = hvgs, BPPARAM = param)
#reducedDim(sce_cr, "PCA") = sce_pca$x
#head(new_sce_pcs)
# number of selected PCs
ncol(reducedDim(sce_cr, "PCA"))
## [1] 10
## scree plot of PCs
plot(attr(reducedDim(sce_cr,"PCA"),"percentVar")*100,ylab = "Percentage of Variance explained")
set.seed(100)
sce_cr <- runUMAP(sce_cr, use_dimred = "PCA")
plotUMAP(sce_cr)
#sce_cr <- runTSNE(sce_cr,use_dimred = "PCA")
#sce_cr <- readRDS("sce_retina.rds")
We build SNN graph and then cluster with louvain
algorithm (or walktrap
).
library(pheatmap)
library(scater)
library(scran)
library(Polychrome)
set.seed(100) # for reproducibility
## generate random colors using function from Polychrome
snn.gr <- buildSNNGraph(sce_cr, use.dimred="PCA", k = 80, BPPARAM = param)
clusters <- igraph::cluster_walktrap(snn.gr)
table(clusters$membership)
##
## 1 2 3 4 5 6 7 8 9 10 11
## 757 408 1378 3378 757 1965 85 218 305 105 383
sce_cr$Cluster <- factor(clusters$membership)
table(sce_cr$Cluster)
##
## 1 2 3 4 5 6 7 8 9 10 11
## 757 408 1378 3378 757 1965 85 218 305 105 383
Generate color palette for our clusters :
set.seed(7654321)
cluster_colors <- createPalette(length(table(sce_cr$Cluster)),
c("#010101", "#ff0000"),
M=1000)
names(cluster_colors) <- unique(as.character(sce_cr$Cluster))
swatch(cluster_colors)
Plot the UMAP dims with cluster-colored cells
########## Function for generating labels on reduced dims plot
get_centroid <- function(dims, cluster){
## x is a vector containing the first dim
## y is a vector containing the second dim
## cluster is a factor containing the cluster label
pol = as.matrix(dims)
clusters = unique(cluster)
cets = sapply(clusters,function(cl){
r = colMeans(dims[cluster %in% cl,])
names(r) =c("x","y")
r
},USE.NAMES = TRUE)
colnames(cets) = clusters
return(cets)
}
#############
plt_df_umap <- data.frame(umap1 = reducedDim(sce_cr,"UMAP")[,1],
umap2 = reducedDim(sce_cr,"UMAP")[,2])
plt_df_umap <- cbind(plt_df_umap,data.frame(colData(sce_cr)))
label_centroid <- get_centroid(reducedDim(sce_cr,"UMAP"), cluster = sce_cr$Cluster)
text_df <- data.frame(x = label_centroid["x",],
y = label_centroid["y",],
text = colnames(label_centroid))
ggplot(data = plt_df_umap)+geom_point(mapping = aes(x = umap1,
y = umap2,
colour = Cluster),
size = 0.6)+
scale_color_manual(values = cluster_colors)+theme_classic()+geom_text(data = text_df,
mapping = aes(x = x,y =y,
label =text))+
guides(colour = guide_legend(override.aes = list(size=6)))
Plot sample-colored cells in UMAP dims
ggplot(data = plt_df_umap)+geom_point(mapping = aes(x = umap1,
y = umap2,
colour = Sample),size = 0.6,
alpha=0.7)+
scale_color_manual(values = list(LD="red", Ctrl="grey"))+theme_classic()
A little extra:
library(clustree)
clusters <- igraph::cluster_louvain(snn.gr)
sce_cr$result_1 <- factor(clusters$membership)
clusters <- igraph::cluster_walktrap(snn.gr)
sce_cr$result_2 <- factor(clusters$membership)
clustree(data.frame(colData(sce_cr)), prefix = "result_")
ggplot(data = plt_df_umap)+geom_point(mapping = aes(x = umap1,
y = umap2,
colour = Cluster),
size = 0.6,alpha = 0.5)+
scale_color_manual(values = cluster_colors)+theme_classic()+geom_text(data = text_df,
mapping = aes(x = x,y =y,
label =text))+
guides(colour = guide_legend(override.aes = list(size=6)))+facet_wrap(.~Sample)
gplt <- colData(sce_cr)
ggplot(data = data.frame(gplt))+geom_bar(mapping = aes(x = Cluster, fill = Sample),
position = "fill")+scale_fill_manual(values = list(LD="red", Ctrl="grey"))
ggplot(data = data.frame(gplt))+geom_bar(mapping = aes(x = Cluster, fill = Sample)
)+scale_fill_manual(values = list(LD="red", Ctrl="grey"))
We find the marker genes for each cluster by doing a pairwiseTTtest
for each cluster vs each of the remaining clusters and combine the results into one list.
markers_cl <- findMarkers(sce_cr, cluster = sce_cr$Cluster,
direction = "up",pval.type = "any", assay.type = "sctrans_norm")
num_genes <- 8
chosen = sapply(c(1:length(markers_cl)), function(clusterID){
marker.set = markers_cl[[clusterID]]
chosen = rownames(head(marker.set, ))
#data.frame('chosen' = as.character(chosen),
# 'clusterID' = clusterID)
chosen
})
plotHeatmap(sce_cr, features= unique(as.character(chosen)),
exprs_values="sctrans_norm", zlim=5, center=TRUE, symmetric=TRUE,
cluster_cols=FALSE, colour_columns_by="Cluster",
columns=order(sce_cr$Cluster),show_colnames=FALSE, fontsize_row = 8)
We use Microglia markers: P2ry12, Siglech, Sparc to identify microglia clusters. Microglia cells with high expression of microglia markers and low expression of myeloid markers.
Look for - Clusters mainly from Control, mostly from LD - Replicating MG : MKI67 (Marker Of Proliferation Ki-67) - Microglia Clusters large/small - monocyte-derived macrophages (mo-MFs),perivascular (pv) macrophages
Clusters of mo-MFs and perivascular (pv) macrophages were enriched for myelomonocytic genes, including Ccr2, Mrc1, Lyz2, and Cd163
plotExpression(sce_cr, x = "Cluster", features = c("P2ry12", "Siglech", "Sparc"),
exprs_values = "sctrans_norm")
multiplot(plotExpression(sce_cr, x = "Cluster", features = c( "H2-Aa"),
exprs_values = "sctrans_norm"),
plotExpression(sce_cr, x = "Cluster", features = c( "Tmem119"),
exprs_values = "sctrans_norm"),
plotExpression(sce_cr, x = "Cluster", features = c( "Lgals3"),
exprs_values = "sctrans_norm"))
multiplot(plotExpression(sce_cr, x = "Cluster", features = c( "Lpl"),
exprs_values = "sctrans_norm"),
plotExpression(sce_cr, x = "Cluster", features = c( "Spp1"),
exprs_values = "sctrans_norm"))
plotExpression(sce_cr, x = "Cluster", features = c("Ccr2", "Mrc1", "Lyz2","Mki67"),
exprs_values = "sctrans_norm")
plotHeatmap(sce_cr, features= c("P2ry12", "Siglech", "Sparc","Lag3","Ccr2", "Cd68","Mrc1", "Lyz2","Mki67"),
exprs_values="counts", zlim=5, center=TRUE, symmetric=TRUE,
cluster_cols=FALSE, colour_columns_by="Cluster", colorScale = cluster_colors,
columns=order(sce_cr$Cluster),show_colnames=FALSE, fontsize_row = 8)
#saveRDS(normalized_data$y, file = "norm_y.rds")
#saveRDS(sce_cr, file = "sce_retina.rds")
We next look for the cluster corresponding to srMG (subretinal microglia) in LD. Prior knowledge assumed as described in the original paper:
1, srMG populations are fewer relative to other populations in LD, so we focus on small clusters 2, down-regulated homeostatic genes, including Tmem119, P2ry12, and Siglech,a phenotype reported in other CNS diseases
Find which cluster is more likely to be corresponding to cluster sMG3 (srMG) in the original paper.
subsce <- sce_cr[,sce_cr$Cluster %in% c(7,11)]
subsce$Cluster <- as.character(subsce$Cluster)
markers_srmg <- findMarkers(subsce, clusters = subsce$Cluster,
assay.type = "sctrans_norm")
markers_srmg$`7`
## DataFrame with 15045 rows and 4 columns
## Top p.value FDR
## <integer> <numeric> <numeric>
## Tuba1b 1 3.79378381681197e-37 5.70774775239358e-33
## H2afz 2 4.05197086962926e-35 3.04809508667863e-31
## Rgs10 3 1.13466482534566e-33 5.69034409910847e-30
## Fcgr3 4 5.23166805044113e-32 1.96776114547217e-28
## Itm2b 5 1.25564202818163e-30 3.77822686279851e-27
## ... ... ... ...
## Tmem67 15041 0.999208402264314 0.999474131511642
## Dpm1 15042 0.999375730377185 0.999517203254054
## Ptcra 15043 0.999384332904669 0.999517203254054
## Ahr 15044 0.999615586688951 0.999682032819414
## Gm12728 15045 0.999750291570248 0.999750291570248
## logFC.11
## <numeric>
## Tuba1b 6.19146748938722
## H2afz 6.98254668196525
## Rgs10 -0.916348533582088
## Fcgr3 -1.19648062513746
## Itm2b -1.61019794421386
## ... ...
## Tmem67 -0.0001014971960209
## Dpm1 9.0454421426682e-05
## Ptcra 7.72762192773092e-05
## Ahr -2.74498672339779e-05
## Gm12728 -2.68342016308196e-05
r_plt <- data.frame(markers_srmg$`7`)
r_plt$genes <- rownames(r_plt)
ggplot(data = r_plt)+geom_point(mapping = aes(x = logFC.11, y = -log10(FDR)))+
geom_text(data = r_plt[abs(r_plt$logFC.11)>1.5 & -log10(r_plt$FDR) > 10 ,],
mapping = aes(
x = logFC.11, y = -log10(FDR), label = genes))+
theme_classic()+xlim(-3,3)
## Warning: Removed 70 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_text).
Refer to Figure 4E in the original paper, the down/up-regulated genes in cluster 9 matching sMG3 well.
rownames(r_plt[r_plt$logFC.11> 2 & -log10(r_plt$FDR) > 5 ,])
## [1] "Tuba1b" "H2afz" "Tubb5" "Stmn1"
## [5] "Ptma" "Ran" "Hmgn1" "Hmgb2"
## [9] "Txn1" "Tagln2" "Anp32b" "Cks1b"
## [13] "Smc2" "Calm1" "2810417H13Rik" "Ranbp1"
## [17] "Smc4" "Dek" "Cox5a" "Tk1"
## [21] "Gapdh" "Erh" "Hmgn2" "Snrpd1"
## [25] "Cenpw" "Prdx1" "Birc5" "Gmnn"
## [29] "2700094K13Rik" "Pcna" "H2afv" "Tmsb10"
## [33] "Rrm1" "Dut" "Asf1b" "Tmsb4x"
## [37] "Cdca8" "Mki67" "Dnajc9" "Mcm6"
## [41] "Tyms" "Ms4a6c" "Racgap1" "Gins2"
## [45] "Mcm5" "Pgam1" "Mcm3" "Lgals1"
## [49] "Lig1" "H2afx" "Cenpm" "Spc24"
## [53] "Ezh2" "Nasp" "Ldha" "Rfc5"
## [57] "Tcf19" "Syce2" "Cdk1" "Rrm2"
## [61] "Ccna2" "Tpx2" "Fen1" "Tubb4b"
## [65] "Apitd1" "Tacc3" "Rad51ap1" "Uhrf1"
## [69] "Rad51" "Chaf1b" "Hells" "Vim"
## [73] "Tipin" "Top2a" "Mcm2" "Mcm7"
## [77] "Mif" "Cbx5" "Ung" "Cdca3"
## [81] "Csrp1" "Slc43a3" "Prim1" "Dhfr"
## [85] "Pbk" "Knstrn" "Ccnb1" "Cdkn3"
## [89] "Cenpa" "Cks2" "Ccnb2" "Tgfbi"
## [93] "Fignl1" "Hmmr" "Clspn" "Crip1"
## [97] "Cenpe" "Pkmyt1" "Hmgb3" "Atad2"
## [101] "Dtl" "Ccne1" "Chaf1a" "Cdc45"
## [105] "Cep55"
rownames(r_plt[r_plt$logFC.11<(-2) & -log10(r_plt$FDR) > 5,])[1:10]
## [1] "Ctsd" "Ctsb" "Cst7" NA NA NA NA NA NA NA
Practise Run this DE analysis for other small microglia cluster from LD group.
We next run monocle2 as described in the paper, as they stated that the srMG cluster is expected in the later development stages because it is particularly distinct from the steady-state (Cluster 1/2 from control) (As shown in Figure 5A). And
library(monocle)
library(dplyr)
lyz_cluster <- assay(sce_cr,"sctrans_norm")["Lyz2",]
lyz_cluster <- reshape2::melt(lyz_cluster)
lyz_cluster$cell <- rownames(lyz_cluster)
lyz_cluster$Cluster <- sce_cr$Cluster
lyz_cluster <- lyz_cluster %>% group_by(Cluster) %>% summarize(mean_lyz2 = mean(value))
lyz_cluster <- lyz_cluster$Cluster[lyz_cluster$mean_lyz2 > quantile(lyz_cluster$mean_lyz2,.75 )]
rep_MG <- assay(sce_cr,"sctrans_norm")["Mki67",]
rep_MG <- reshape2::melt(rep_MG)
rep_MG$cell <- rownames(rep_MG)
rep_MG$Cluster <- sce_cr$Cluster
rep_MG <- rep_MG %>% group_by(Cluster) %>% summarize(mean_mki67 = mean(value))
rep_MG <- rep_MG$Cluster[rep_MG$mean_mki67 > quantile(rep_MG$mean_mki67,.85 )]
d <- sce_cr[which(rownames(sce_cr) %in% hvgs[1:1000]), which(! sce_cr$Cluster %in% c(rep_MG,lyz_cluster))]
d <- d[!duplicated(rownames(d)), ]
colnames(d) <- 1:ncol(d)
geneNames <- rownames(d)
rownames(d) <- 1:nrow(d)
pd <- data.frame(colData(d) )
pd <- new("AnnotatedDataFrame", data=pd)
fd <- data.frame(gene_short_name = geneNames)
fd <- new("AnnotatedDataFrame", data=fd)
dCellData <- newCellDataSet(counts(d), phenoData = pd, featureData = fd)
#
dCellData <- setOrderingFilter(dCellData, which(geneNames %in% hvgs[1:1000]))
dCellData <- estimateSizeFactors(dCellData)
dCellDataSet <- reduceDimension(dCellData,reduction_method = "DDRTree", pseudo_expr = 1)
dCellDataSet <- orderCells(dCellDataSet, reverse = FALSE)
plot_cell_trajectory(dCellDataSet,color_by = "Pseudotime")
plot_cell_trajectory(dCellDataSet,color_by = "Sample")+scale_color_manual(values = list(Ctrl = "grey",
LD = "red"))+facet_wrap(.~Cluster)
plot_cell_trajectory(dCellDataSet,color_by = "Cluster")+
scale_color_manual(values = cluster_colors)+facet_wrap(.~Sample)+
guides(colour = guide_legend(override.aes = list(size=6)))
plot_cell_trajectory(dCellDataSet,color_by = "State")+facet_wrap(.~Sample)
d <- sce_cr[which(rownames(sce_cr) %in% hvgs[1:1000]), which(sce_cr$Cluster %in% c(11,8,7,13,14))]
d <- d[!duplicated(rownames(d)), ]
colnames(d) <- 1:ncol(d)
geneNames <- rownames(d)
rownames(d) <- 1:nrow(d)
pd <- data.frame(colData(d) )
pd <- new("AnnotatedDataFrame", data=pd)
fd <- data.frame(gene_short_name = geneNames)
fd <- new("AnnotatedDataFrame", data=fd)
dCellData <- newCellDataSet(counts(d), phenoData = pd, featureData = fd)
#
dCellData <- setOrderingFilter(dCellData, which(geneNames %in% hvgs[1:1000]))
dCellData <- estimateSizeFactors(dCellData)
dCellDataSet <- reduceDimension(dCellData,reduction_method = "DDRTree", pseudo_expr = 1)
dCellDataSet <- orderCells(dCellDataSet, reverse = FALSE)
#dCellDataSet <- orderCells(dCellDataSet, reverse = FALSE, root_state = 11)
plot_cell_trajectory(dCellDataSet,color_by = "Cluster")+scale_color_manual(values = cluster_colors)+
guides(colour = guide_legend(override.aes = list(size=6)))
plot_cell_trajectory(dCellDataSet,color_by = "Pseudotime")
plot_cell_trajectory(dCellDataSet,color_by = "State")
# Store the ordering
pseudotime_monocle2 <-
data.frame(
Cluster = phenoData(dCellDataSet)$Cluster,
pseudotime = phenoData(dCellDataSet)$Pseudotime,
State = phenoData(dCellDataSet)$State
)
ggplot(data = pseudotime_monocle2)+geom_boxplot(mapping = aes(x = Cluster,
y = pseudotime))