From 5daced0131d72495ec10a6edc45ab3acb42ee6c6 Mon Sep 17 00:00:00 2001
From: Luke Zappia <lazappi@users.noreply.github.com>
Date: Tue, 4 Jun 2019 19:24:06 +1000
Subject: [PATCH] Refactor summariseDiff

Split into functions, can now actually tell how it works...
---
 NAMESPACE             |   1 +
 R/compare.R           | 257 +++++++++++++++++++-----------------------
 man/summariseStats.Rd |  25 ++++
 man/tidyStatSumm.Rd   |  19 ++++
 4 files changed, 159 insertions(+), 143 deletions(-)
 create mode 100644 man/summariseStats.Rd
 create mode 100644 man/tidyStatSumm.Rd

diff --git a/NAMESPACE b/NAMESPACE
index b0706b9..6b06d02 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -121,6 +121,7 @@ importFrom(methods,new)
 importFrom(methods,slot)
 importFrom(methods,slotNames)
 importFrom(methods,validObject)
+importFrom(stats,aggregate)
 importFrom(stats,dnbinom)
 importFrom(stats,median)
 importFrom(stats,model.matrix)
diff --git a/R/compare.R b/R/compare.R
index 3f3c618..9ef5aea 100644
--- a/R/compare.R
+++ b/R/compare.R
@@ -828,149 +828,120 @@ makeOverallPanel <- function(comp, diff, title = "Overall comparison",
 #' @export
 summariseDiff <- function(diff) {
 
-    datasets <- unique(diff$ColData$Dataset)
-
-    rowData.mads <- sapply(datasets, function(dataset) {
-        df <- diff$RowData[diff$RowData$Dataset == dataset, ]
-        mean <- median(abs(df$RankDiffMeanLogCPM))
-        var <- median(abs(df$RankDiffVarLogCPM))
-        zeros <- median(abs(df$RankDiffZeros))
-        mean.var <- median(abs(df$MeanRankVarDiff))
-        mean.zeros <- median(abs(df$MeanRankZerosDiff))
-        return(c(Mean = mean, Variance = var, ZerosGene = zeros,
-                 MeanVar = mean.var, MeanZeros = mean.zeros))
-    })
-    rowData.mads.z <- t(scale(t(rowData.mads)))
-
-    colData.mads <- sapply(datasets, function(dataset) {
-        df <- diff$ColData[diff$ColData$Dataset == dataset, ]
-        lib.size <- median(abs(df$RankDiffLibSize))
-        zeros <- median(abs(df$RankDiffZeros))
-        return(c(LibSize = lib.size, ZerosCell = zeros))
-    })
-    colData.mads.z <- t(scale(t(colData.mads)))
-
-    mads <- data.frame(Dataset = datasets, t(rowData.mads), t(colData.mads))
-    mads.z <- data.frame(Dataset = datasets, t(rowData.mads.z),
-                         t(colData.mads.z))
-
-    rowData.ranks <- matrixStats::rowRanks(rowData.mads)
-    colData.ranks <- matrixStats::rowRanks(colData.mads)
-
-    ranks.mads <- data.frame(Dataset = datasets, t(rowData.ranks),
-                             t(colData.ranks))
-    colnames(ranks.mads) <- paste0(colnames(mads), "Rank")
-
-    rowData.maes <- sapply(datasets, function(dataset) {
-        df <- diff$RowData[diff$RowData$Dataset == dataset, ]
-        mean <- mean(abs(df$RankDiffMeanLogCPM))
-        var <- mean(abs(df$RankDiffVarLogCPM))
-        zeros <- mean(abs(df$RankDiffZeros))
-        mean.var <- mean(abs(df$MeanRankVarDiff))
-        mean.zeros <- mean(abs(df$MeanRankZerosDiff))
-        return(c(Mean = mean, Variance = var, ZerosGene = zeros,
-                 MeanVar = mean.var, MeanZeros = mean.zeros))
-    })
-    rowData.maes.z <- t(scale(t(rowData.maes)))
-
-    colData.maes <- sapply(datasets, function(dataset) {
-        df <- diff$ColData[diff$ColData$Dataset == dataset, ]
-        lib.size <- mean(abs(df$RankDiffLibSize))
-        zeros <- mean(abs(df$RankDiffZeros))
-        return(c(LibSize = lib.size, ZerosCell = zeros))
-    })
-    colData.maes.z <- t(scale(t(colData.maes)))
-
-    maes <- data.frame(Dataset = datasets, t(rowData.maes), t(colData.maes))
-    maes.z <- data.frame(Dataset = datasets, t(rowData.maes.z),
-                         t(colData.maes.z))
-
-    rowData.ranks <- matrixStats::rowRanks(rowData.maes)
-    colData.ranks <- matrixStats::rowRanks(colData.maes)
-
-    ranks.maes <- data.frame(Dataset = datasets, t(rowData.ranks), t(colData.ranks))
-    colnames(ranks.maes) <- paste0(colnames(mads), "Rank")
-
-    rowData.rmse <- sapply(datasets, function(dataset) {
-        df <- diff$RowData[diff$RowData$Dataset == dataset, ]
-        mean <- sqrt(mean(df$RankDiffMeanLogCPM ^ 2))
-        var <- sqrt(mean(df$RankDiffVarLogCPM ^ 2))
-        zeros <- sqrt(mean(df$RankDiffZeros ^ 2))
-        mean.var <- sqrt(mean(df$MeanRankVarDiff ^ 2))
-        mean.zeros <- sqrt(mean(df$MeanRankZerosDiff ^ 2))
-        return(c(Mean = mean, Variance = var, ZerosGene = zeros,
-                 MeanVar = mean.var, MeanZeros = mean.zeros))
-    })
-    rowData.rmse.z <- t(scale(t(rowData.rmse)))
-
-    colData.rmse <- sapply(datasets, function(dataset) {
-        df <- diff$ColData[diff$ColData$Dataset == dataset, ]
-        lib.size <- sqrt(mean(df$RankDiffLibSize ^ 2))
-        zeros <- sqrt(mean(df$RankDiffZeros ^ 2))
-        return(c(LibSize = lib.size, ZerosCell = zeros))
-    })
-    colData.rmse.z <- t(scale(t(colData.rmse)))
-
-    rmse <- data.frame(Dataset = datasets, t(rowData.rmse), t(colData.rmse))
-    rmse.z <- data.frame(Dataset = datasets, t(rowData.rmse.z),
-                         t(colData.rmse.z))
-
-    rowData.ranks <- matrixStats::rowRanks(rowData.rmse)
-    colData.ranks <- matrixStats::rowRanks(colData.rmse)
-
-    ranks.rmse <- data.frame(Dataset = datasets, t(rowData.ranks),
-                             t(colData.ranks))
-    colnames(ranks.rmse) <- paste0(colnames(rmse), "Rank")
-
-    mads <- stats::reshape(mads, varying = 2:8, direction = "long",
-                           idvar = "Dataset", timevar = "Statistic",
-                           times = colnames(mads)[2:8], v.names = "MAD")
-
-    mads.z <- stats::reshape(mads.z, varying = 2:8, direction = "long",
-                             idvar = "Dataset", timevar = "Statistic",
-                             times = colnames(mads)[2:8],
-                             v.names = "MADScaled")
-
-    ranks.mads <- stats::reshape(ranks.mads, varying = 2:8, direction = "long",
-                                 idvar = "Dataset", timevar = "Statistic",
-                                 times = colnames(ranks.mads)[2:8],
-                                 v.names = "Rank")
-
-    maes <- stats::reshape(maes, varying = 2:8, direction = "long",
-                           idvar = "Dataset", timevar = "Statistic",
-                           times = colnames(maes)[2:8], v.names = "MAE")
-
-    maes.z <- stats::reshape(maes.z, varying = 2:8, direction = "long",
-                             idvar = "Dataset", timevar = "Statistic",
-                             times = colnames(mads)[2:8],
-                             v.names = "MAEScaled")
-
-    ranks.maes <- stats::reshape(ranks.maes, varying = 2:8, direction = "long",
-                                 idvar = "Dataset", timevar = "Statistic",
-                                 times = colnames(ranks.maes)[2:8],
-                                 v.names = "Rank")
-
-    rmse <- stats::reshape(rmse, varying = 2:8, direction = "long",
-                           idvar = "Dataset", timevar = "Statistic",
-                           times = colnames(mads)[2:8], v.names = "RMSE")
-
-    rmse.z <- stats::reshape(rmse.z, varying = 2:8, direction = "long",
-                             idvar = "Dataset", timevar = "Statistic",
-                             times = colnames(mads)[2:8],
-                             v.names = "RMSEScaled")
-
-    ranks.rmse <- stats::reshape(ranks.rmse, varying = 2:8, direction = "long",
-                                 idvar = "Dataset", timevar = "Statistic",
-                                 times = colnames(ranks.rmse)[2:8],
-                                 v.names = "Rank")
-
-    summary <- data.frame(mads, MADScaled = mads.z$MADScaled,
-                          MADRank = ranks.mads$Rank,
-                          MAE = maes$MAE, MAEScaled = maes.z$MAEScaled,
-                          MAERank = ranks.maes$Rank,
-                          RMSE = rmse$RMSE, RMSEScaled = rmse.z$RMSEScaled,
-                          RMSERank = ranks.rmse$Rank)
-    row.names(summary) <- NULL
+    row.stats <- c(Mean = "RankDiffMeanLogCPM",
+                   Variance = "RankDiffVarLogCPM",
+                   ZerosGene = "RankDiffZeros",
+                   MeanVar = "MeanRankVarDiff",
+                   MeanZeros = "MeanRankZerosDiff")
+
+    row.mad <- summariseStats(diff$RowData, "Dataset", row.stats, "MAD")
+    row.mae <- summariseStats(diff$RowData, "Dataset", row.stats, "MAE")
+    row.rmse <- summariseStats(diff$RowData, "Dataset", row.stats, "RMSE")
+
+    row.list <- list(row.mad, row.mae, row.rmse)
+    row.list <- lapply(row.list, function(summ) {summ[, -c(1, 2)]})
+    row.summ <- data.frame(Dataset = row.mad$Dataset,
+                           Statistic = row.mad$Statistic)
+    row.list <- c(row.summ, row.list)
+    row.summ <- do.call("cbind", row.list)
+
+    col.stats <- c(LibSize = "RankDiffLibSize",
+                   ZerosCell = "RankDiffZeros")
+
+    col.mad <- summariseStats(diff$ColData, "Dataset", col.stats, "MAD")
+    col.mae <- summariseStats(diff$ColData, "Dataset", col.stats, "MAE")
+    col.rmse <- summariseStats(diff$ColData, "Dataset", col.stats, "RMSE")
+
+    col.list <- list(col.mad, col.mae, col.rmse)
+    col.list <- lapply(col.list, function(summ) {summ[, -c(1, 2)]})
+    col.summ <- data.frame(Dataset = col.mad$Dataset,
+                           Statistic = col.mad$Statistic)
+    col.list <- c(col.summ, col.list)
+    col.summ <- do.call("cbind", col.list)
+
+    summary <- rbind(row.summ, col.summ)
 
     return(summary)
 }
+
+#' Summarise statistics
+#'
+#' Summarise columns of a data.frame using a single measure.
+#'
+#' @param data The data.frame to summarise
+#' @param split.col Name of the column used to split the dataset
+#' @param stat.cols Names of the columns to summarise. If this vector is named
+#' those names will be used in the output.
+#' @param measure The measure to use for summarisation.
+#'
+#' @return data.frame with the summarised measure, scaled and ranked
+#'
+#' @importFrom stats aggregate
+summariseStats <- function(data, split.col, stat.cols,
+                           measure = c("MAD", "MAE", "RMSE")) {
+
+    measure <- match.arg(measure)
+
+    if (is.null(names(stat.cols))) {
+        names(stat.cols) <- stat.cols
+    }
+
+    switch (measure,
+            "MAD" = {
+                measure_fun <- function(x) {median(abs(x))}
+            },
+            "MAE" = {
+                measure_fun <- function(x) {mean(abs(x))}
+            },
+            "RMSE" = {
+                measure_fun <- function(x) {sqrt(mean(abs(x ^ 2)))}
+            }
+    )
+
+    summ <- aggregate(data[, stat.cols], list(Dataset = data[[split.col]]),
+                      measure_fun)
+    colnames(summ) <- c(split.col, names(stat.cols))
+
+    tidy.summ <- tidyStatSumm(summ, measure)
+
+    return(tidy.summ)
+}
+
+#' Tidy summarised statistics
+#'
+#' Convert a statistic summary to tidy format and add ranks and scaled values
+#'
+#' @param stat.summ The summary to convert
+#' @param measure The name of the summarisation measure
+#'
+#' @return tidy data.frame with the summarised measure, scaled and ranked
+tidyStatSumm <- function(stat.summ, measure = c("MAD", "MAE", "RMSE")) {
+
+    measure <- match.arg(measure)
+
+    summ.mat <- t(stat.summ[, -1])
+    colnames(summ.mat) <- stat.summ[, 1]
+
+    scale.summ <- apply(summ.mat, 1, scale)
+    # Check if apply has returned a vector
+    if (is.vector(scale.summ)) {
+        scale.summ <- t(as.matrix(scale.summ))
+    }
+
+    rank.summ <- apply(summ.mat, 1, rank)
+    if (is.vector(rank.summ)) {
+        rank.summ <- t(as.matrix(rank.summ))
+    }
+
+    tidy.summ <- as.data.frame.table(t(summ.mat))
+    colnames(tidy.summ) <- c("Dataset", "Statistic", measure)
+
+    tidy.scale <- as.data.frame.table(scale.summ)
+    tidy.rank <- as.data.frame.table(rank.summ)
+
+    tidy.summ[[paste0(measure, "Scaled")]] <- tidy.scale[, 3]
+    tidy.summ[[paste0(measure, "Rank")]] <- tidy.rank[, 3]
+
+    return(tidy.summ)
+}
+
diff --git a/man/summariseStats.Rd b/man/summariseStats.Rd
new file mode 100644
index 0000000..72459e4
--- /dev/null
+++ b/man/summariseStats.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compare.R
+\name{summariseStats}
+\alias{summariseStats}
+\title{Summarise statistics}
+\usage{
+summariseStats(data, split.col, stat.cols, measure = c("MAD", "MAE",
+  "RMSE"))
+}
+\arguments{
+\item{data}{The data.frame to summarise}
+
+\item{split.col}{Name of the column used to split the dataset}
+
+\item{stat.cols}{Names of the columns to summarise. If this vector is named
+those names will be used in the output.}
+
+\item{measure}{The measure to use for summarisation.}
+}
+\value{
+data.frame with the summarised measure, scaled and ranked
+}
+\description{
+Summarise columns of a data.frame using a single measure.
+}
diff --git a/man/tidyStatSumm.Rd b/man/tidyStatSumm.Rd
new file mode 100644
index 0000000..fb49489
--- /dev/null
+++ b/man/tidyStatSumm.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compare.R
+\name{tidyStatSumm}
+\alias{tidyStatSumm}
+\title{Tidy summarised statistics}
+\usage{
+tidyStatSumm(stat.summ, measure = c("MAD", "MAE", "RMSE"))
+}
+\arguments{
+\item{stat.summ}{The summary to convert}
+
+\item{measure}{The name of the summarisation measure}
+}
+\value{
+tidy data.frame with the summarised measure, scaled and ranked
+}
+\description{
+Convert a statistic summary to tidy format and add ranks and scaled values
+}
-- 
GitLab