Newer
Older
#' Add feature statistics
#'
#' Add additional feature statistics to an SCESet object
#'
#' @param sce SCESet to add feature statistics to.
#' @param value the expression value to calculate statistics for. Options are
#' "counts", "cpm", "tpm" or "fpkm". The values need to exist in the
#' given SCESet.
#' @param log logical. Whether to take log2 before calculating statistics.
#' @param offset offset to add to avoid taking log of zero.
#' @param no.zeros logical. Whether to remove all zeros from each feature before
#' calculating statistics.
#'
#' @details
#' Currently adds the following statistics: mean, variance, coefficient of
#' variation, median and median absolute deviation. Statistics are added to
#' the \code{fData} slot and are named \code{Stat[Log]Value[No0]} where
#' \code{Log} and \code{No0} are added if those arguments are true.
#' UpperCamelCase is used to differentiate these columns from those added by
#' \code{scater}.
#'
#' @return SCESet with additional feature statistics
#'
#' @importFrom Biobase fData fData<-
addFeatureStats <- function(sce, value = c("counts", "cpm", "tpm", "fpkm"),
log = FALSE, offset = 1, no.zeros = FALSE) {
value <- match.arg(value)
switch(value,
counts = {
values = scater::counts(sce)
}
)
if (no.zeros) {
values[values == 0] <- NA
}
if (log) {
values = log2(values + offset)
mean.str <- paste0("Mean", suffix)
var.str <- paste0("Var", suffix)
cv.str <- paste0("CV", suffix)
med.str <- paste0("Med", suffix)
mad.str <- paste0("MAD", suffix)
fData(sce)[, mean.str] <- rowMeans(values, na.rm = TRUE)
fData(sce)[, var.str] <- matrixStats::rowVars(values, na.rm = TRUE)
fData(sce)[, cv.str] <- sqrt(fData(sce)[, var.str]) /
fData(sce)[, mean.str]
fData(sce)[, med.str] <- matrixStats::rowMedians(values, na.rm = TRUE)
fData(sce)[, mad.str] <- matrixStats::rowMads(values, na.rm = TRUE)
return(sce)
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
}
#' Add gene lengths
#'
#' Add gene lengths to an SCESet object
#'
#' @param sce SCESet to add gene lengths to.
#' @param method Method to use for creating lengths.
#' @param loc Location parameter for the generate method.
#' @param scale Scale parameter for the generate method.
#' @param lengths Vector of lengths for the sample method.
#'
#' @details
#' This function adds simulated gene lengths to the \code{fData} slot of an
#' \code{SCESet} object that can be used for calculating length normalised
#' expression values such as TPM or FPKM. The \code{generate} simulates lengths
#' using a (rounded) log-normal distribution, with the default \code{loc} and
#' \code{scale} parameters based on human coding genes. Alternatively the
#' \code{sample} method can be used which randomly samples lengths (with
#' replacement) from a supplied vector.
#'
#' @return SCESet with added gene lengths
#' @examples
#' # Default generate method
#' sce <- simpleSimulate()
#' sce <- addGeneLengths(sce)
#' head(fData(sce))
#' # Sample method (human coding genes)
#' \dontrun{
#' library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' library(GenomicFeatures)
#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#' tx.lens <- transcriptLengths(txdb, with.cds_len = TRUE)
#' tx.lens <- tx.lens[tx.lens$cds_len > 0, ]
#' gene.lens <- max(splitAsList(tx.lens$tx_len, tx.lens$gene_id))
#' sce <- addGeneLengths(sce, method = "sample", lengths = gene.lens)
#' }
#' @export
#' @importFrom stats rlnorm
addGeneLengths <- function(sce, method = c("generate", "sample"), loc = 7.9,
scale = 0.7, lengths = NULL) {
method <- match.arg(method)
checkmate::assertClass(sce, "SCESet")
checkmate::assertNumber(loc)
checkmate::assertNumber(scale, lower = 0)
checkmate::assertNumeric(lengths, lower = 0, null.ok = TRUE)
switch(method,
generate = {
sim.lengths <- rlnorm(nrow(sce), meanlog = loc, sdlog = scale)
sim.lengths <- round(sim.lengths)
},
sample = {
if (is.null(lengths)) {
stop("Lengths must be supplied to use the sample method.")
} else {
sim.lengths <- sample(lengths, nrow(sce), replace = TRUE)
}
}
)
fData(sce)$Length <- sim.lengths
return(sce)
}
#txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene
#tx.lens <- GenomicFeatures::transcriptLengths(txdb, with.cds_len = TRUE)
#tx.lens <- tx.lens[tx.lens$cds_len > 0, ]
#gene.lens <- max(IRanges::splitAsList(tx.lens$tx_len, tx.lens$gene_id))
#lens <- rlnorm(length(gene.lens), meanlog = 7.9, sdlog = 0.7)