Newer
Older
#' Compare SCESet objects
#'
#' Combine the data from several SCESet objects and produce some basic plots
#' comparing.
#'
#' @param sces named list of SCESet objects to combine and compare.
#'
#' @details
#' The return list has three items:
#'
#' \describe{
#' \item{\code{FeatureData}}{Combined feature data from the provided SCESets.}
#' \item{\code{PhenoData}}{Combined pheno data from the provided SCESets.}
#' \item{\code{Plots}}{Comparison plots
#' \describe{
#' \item{\code{Means}}{Violin plot of mean distribution.}
#' \item{\code{Variances}}{Violin plot of variance distribution.}
#' \item{\code{MeanVar}}{Scatter plot with fitted lines showing the
#' mean-variance relationship.}
#' \item{\code{LibraySizes}}{Boxplot of the library size distribution.}
#' \item{\code{ZerosGene}}{Boxplot of the percentage of each gene that is
#' zero.}
#' \item{\code{ZerosCell}}{Boxplot of the percentage of each cell that is
#' zero.}
#' }
#' }
#' }
#'
#' The plots returned by this function are created using
#' \code{\link[ggplot2]{ggplot}} and are only a sample of the kind of plots you
#' might like to consider. The data used to create these plots is also returned
#' and should be in the correct format to allow you to create further plots
#' using \code{\link[ggplot2]{ggplot}}.
#'
#' @return List containing the combined datasets and plots.
#' @examples
#' sim2 <- simpleSimulate(nGenes = 1000, nCells = 20)
#' comparison <- compareSCESets(list(Splat = sim1, Simple = sim2))
#' names(comparison)
#' names(comparison$Plots)
#' @importFrom ggplot2 ggplot aes geom_point geom_smooth geom_boxplot
#' geom_violin scale_y_continuous scale_y_log10 xlab ylab ggtitle theme_minimal
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
compareSCESets <- function(sces) {
checkmate::assertList(sces, types = "SCESet", any.missing = FALSE,
min.len = 1, names = "unique")
for (name in names(sces)) {
sce <- sces[[name]]
fData(sce)$Dataset <- name
pData(sce)$Dataset <- name
sce <- scater::calculateQCMetrics(sce)
cpm(sce) <- edgeR::cpm(counts(sce))
sce <- addFeatureStats(sce, "counts")
sce <- addFeatureStats(sce, "cpm")
sce <- addFeatureStats(sce, "cpm", log = TRUE)
sces[[name]] <- sce
}
fData.all <- fData(sces[[1]])
fData.all$Dataset <- names(sces)[[1]]
pData.all <- pData(sces[[1]])
pData.all$Dataset <- names(sces)[[1]]
if (length(sces) > 1) {
for (name in names(sces)[-1]) {
fData.all <- rbindMatched(fData.all, fData(sce))
pData.all <- rbindMatched(pData.all, pData(sce))
}
}
means <- ggplot(fData.all,
aes(x = Dataset, y = mean_log_cpm, colour = Dataset)) +
geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
ylab(expression(paste("Mean ", log[2], "(CPM + 1)"))) +
ggtitle("Distribution of mean expression") +
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
vars <- ggplot(fData.all,
aes(x = Dataset, y = var_cpm, colour = Dataset)) +
geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
scale_y_log10(labels = scales::comma) +
ylab("CPM Variance") +
ggtitle("Distribution of variance") +
theme_minimal()
mean.var <- ggplot(fData.all,
aes(x = mean_log_cpm, y = var_log_cpm, colour = Dataset,
fill = Dataset)) +
geom_point() +
geom_smooth() +
xlab(expression(paste("Mean ", log[2], "(CPM + 1)"))) +
ylab(expression(paste("Variance ", log[2], "(CPM + 1)"))) +
ggtitle("Mean-variance relationship") +
theme_minimal()
libs <- ggplot(pData.all,
aes(x = Dataset, y = total_counts, colour = Dataset)) +
geom_boxplot() +
scale_y_continuous(labels = scales::comma) +
ylab("Total counts per cell") +
ggtitle("Distribution of library sizes") +
theme_minimal()
z.gene <- ggplot(fData.all,
aes(x = Dataset, y = pct_dropout, colour = Dataset)) +
geom_boxplot() +
scale_y_continuous(limits = c(0, 100)) +
ylab("Percentage zeros per gene") +
ggtitle("Distribution of zeros per gene") +
theme_minimal()
z.cell <- ggplot(pData.all,
aes(x = Dataset, y = pct_dropout, colour = Dataset)) +
geom_boxplot() +
scale_y_continuous(limits = c(0, 100)) +
ylab("Percentage zeros per cell") +
ggtitle("Distribution of zeros per cell") +
theme_minimal()
comparison <- list(FeatureData = fData.all,
PhenoData = pData.all,
Plots = list(Means = means,
Variances = vars,
MeanVar = mean.var,
LibrarySizes = libs,
ZerosGene = z.gene,
ZerosCell = z.cell))
return(comparison)
}