#' Estimate eQTL simulation parameters
#'
#' Estimate simulation parameters for the eQTL population simulation from
#' real data. See the individual estimation functions for more details on
#' how this is done. 
#'
#' @param eQTLparams eQTLParams object containing parameters for the 
#'         simulation of the mean expression levels for the population.
#'        See \code{\link{eQTLParams}} for details.
#' @param all.pairs Txt file with all eQTL pairs from a bulk eQTL analysis. Must
#'        include 3 columns: 'gene_id', 'pval_nominal', and 'slope'.
#' @param gene.means Dataframe of real gene means across a population, where 
#'        each row is a gene and each column is an individual in the population.
#'
#' @seealso
#' \code{\link{eQTLEstES}},  \code{\link{eQTLEstMeanCV}}
#'
#' @return eQTLParams object containing the estimated parameters.
#'
#' @examples
#' # Load example data
#' data(ex_means)
#' data(ex_pairs)
#' eQTLparams <- eQTLEstimate()
#' eQTLparams
#' 
#' @export
eQTLEstimate <- function(eQTLparams = neweQTLParams(),
                         all.pairs = ex_pairs,
                         gene.means = ex_means){
    
    checkmate::assertClass(eQTLparams, "eQTLParams")
    
    # Get parameters for eQTL Effect Size distribution
    eQTLparams <- eQTLEstES(all.pairs, eQTLparams)
        
    # Get parameters for population wide gene mean and variance distributions
    eQTLparams <- eQTLEstMeanCV(gene.means, eQTLparams)

    return(eQTLparams)
}

#' Estimate eQTL Effect Size parameters
#'
#' Estimate rate and shape parameters for the gamma distribution used to
#' simulate eQTL (eSNP-eGene) effect sizes.
#'
#' @param all.pairs Txt file with all eQTL pairs from a bulk eQTL analysis. Must
#'        include 3 columns: 'gene_id', 'pval_nominal', and 'slope'. 
#' @param eQTLparams eQTLParams object containing parameters for the 
#'         simulation of the mean expression levels for the population.
#'        See \code{\link{eQTLParams}} for details.
#'
#' @details
#' Parameters for the gamma distribution are estimated by fitting the top eSNP-
#' eGene pair effect sizes using \code{\link[fitdistrplus]{fitdist}}. The 
#' 'maximum goodness-of-fit estimation' method is used to minimise the 
#' Cramer-von Mises distance. This can fail in some situations, in which case 
#' the 'method of moments estimation' method is used instead. 
#'
#' @return eQTLparams object with estimated values.
#' @importFrom data.table data.table .I
eQTLEstES <- function(all.pairs, eQTLparams) {

    # Test input eSNP-eGene pairs
    if (!('gene_id' %in% names(all.pairs) &
          'pval_nominal' %in% names(all.pairs))) {
        stop("Incorrect format for all.pairs. See example data.")
    }
    
    # Select top eSNP for each gene (i.e. lowest p.value)
    all.pairs <- data.table::data.table(all.pairs)
    pairs_top <- all.pairs[all.pairs[, .I[which.min(pval_nominal)], 
                                     by='gene_id']$V1]

    # Fit absolute value of effect sizes to gamma distribution
    e.sizes <- abs(pairs_top$slope)
    fit <- fitdistrplus::fitdist(e.sizes, "gamma", method = "mge", gof = "CvM")
    
    if (fit$convergence > 0) {
        warning("Fitting effect sizes using the Goodness of Fit method failed,",
                " using the Method of Moments instead")
        fit <- fitdistrplus::fitdist(e.sizes, "gamma", method = "mme")
    }
    
    eQTLparams <- setParams(eQTLparams, 
                            eqtlES.shape = unname(fit$estimate["shape"]),
                            eqtlES.rate = unname(fit$estimate["rate"]))

    return(eQTLparams)
}

#' Estimate gene mean and gene mean variance parameters
#'
#' The Shapiro-Wilks test is used to determine if the library sizes are
#' normally distributed. If so a normal distribution is fitted to the library
#' sizes, if not (most cases) a log-normal distribution is fitted and the
#' estimated parameters are added to the params object. See
#' \code{\link[fitdistrplus]{fitdist}} for details on the fitting.
#'
#' @param eQTLparams eQTLParams object containing parameters for the 
#'         simulation of the mean expression levels for the population.
#'        See \code{\link{eQTLParams}} for details.
#' @param gene.means Dataframe of real gene means across a population, where 
#'        each row is a gene and each column is an individual in the population.
#'
#' @details
#' Parameters for the mean gamma distribution are estimated by fitting the mean
#' (across the population) expression of genes that meet the criteria (<50% of
#' samples have exp <0.1) and parameters for the cv gamma distribution are 
#' estimated for each bin of mean expression using the cv of expression across
#' the population for genes in that bin. Both are fit using 
#' \code{\link[fitdistrplus]{fitdist}}. The "Nelder-Mead" method is used to fit
#' the mean gamma distribution and the 'maximum goodness-of-fit estimation' 
#' method is used to minimise the Cramer-von Mises distance for the CV 
#' distribution.
#' 
#' @return eQTLParams object with estimated values.
#' @importFrom stats quantile
eQTLEstMeanCV <- function(gene.means, eQTLparams) {
    
    # Test input gene means
    if ((anyNA(gene.means) | !(validObject(rowSums(gene.means))))) {
        stop("Incorrect format or NAs present in gene.means. See example data.")
    }
    
    # Remove genes with low variance/low means
    abv_thr <- data.frame(perc = (rowSums(gene.means >= 0.1)/ncol(gene.means)))
    genes <- row.names(subset(abv_thr, perc > 0.5))
    gene.means <- gene.means[genes, ]
    
    # Calculate mean expression parameters
    means <- rowMeans(gene.means)
    mfit <- fitdistrplus::fitdist(means, "gamma", optim.method="Nelder-Mead")
    
    # Calculate CV parameters for genes based on 10 expresion mean bins
    print(eQTLparams)
    nbins <- getParam(eQTLparams, "bulkcv.bins")
    bins <- split(means, cut(means, quantile(means,(0:nbins)/nbins), include.lowest=T))
    cvparams <- data.frame(start = character(), end = character(),
                           shape = character(), rate = character(), 
                           stringsAsFactors=FALSE)
    for(b in names(bins)){
        stst <- unlist(strsplit(gsub("\\)|\\(|\\[|\\]", "", b), ','))
        b_genes <- names(unlist(bins[b], use.names = T))
        b_genes <- gsub(paste0(b, '.'), '', b_genes, fixed=T)
        b_gene.means <- gene.means[b_genes, ]
        cv <- apply(b_gene.means, 1, co.var)
        cv[is.na(cv)] <- 0
        cvfit <- fitdistrplus::fitdist(cv, "gamma", method = "mge", gof = "CvM")
        cvparams <- rbind(cvparams, 
                          list(start= as.numeric(stst[1]),
                               end= as.numeric(stst[2]), 
                               shape=cvfit$estimate['shape'],
                               rate=cvfit$estimate['rate']),
                          stringsAsFactors=FALSE)
    }
    cvparams[1, 'start'] <- 0
    cvparams[nrow(cvparams), 'end'] <- 1e100
    eQTLparams <- setParams(eQTLparams, 
                             bulkmean.shape = unname(mfit$estimate["shape"]),
                             bulkmean.rate = unname(mfit$estimate["rate"]),
                             bulkcv.param = cvparams)

    return(eQTLparams)
}