diff --git a/DESCRIPTION b/DESCRIPTION index 2a4729c..97f9c48 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,4 +35,5 @@ Imports: tidyr, tidyselect Suggests: + dots, roxygen2 diff --git a/NAMESPACE b/NAMESPACE index b6ba566..f990e42 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -50,6 +50,7 @@ importFrom(dplyr,mutate) importFrom(dplyr,mutate_if) importFrom(dplyr,n) importFrom(dplyr,rename) +importFrom(dplyr,row_number) importFrom(dplyr,select) importFrom(dplyr,slice) importFrom(dplyr,slice_max) @@ -80,6 +81,7 @@ importFrom(stats,qbeta) importFrom(stats,uniroot) importFrom(stringr,str_replace) importFrom(stringr,str_replace_all) +importFrom(stringr,str_sub) importFrom(tibble,add_column) importFrom(tibble,add_row) importFrom(tibble,as_tibble) diff --git a/R/mash-evaluation.R b/R/mash-evaluation.R new file mode 100644 index 0000000..52b6fdf --- /dev/null +++ b/R/mash-evaluation.R @@ -0,0 +1,1036 @@ +# --- Get Results from Mash Object with mashr get functions --------- + +#' @title Get current date-time in a filename-appropriate format. +#' +#' @description Converts the current \code{Sys.time()} system time to a format +#' that is acceptable to include in a filename. Changes punctuation that +#' won't work in a filename. +#' +#' @return A string containing the current date-time with spaces and colons +#' replaced with underscores and periods, respectively. +#' +#' @importFrom stringr str_replace_all +get_date_filename <- function(){ + str_replace_all(str_replace_all(Sys.time(), ":", "."), " ", "_") +} + + +#' Get number of conditions +#' +#' @param m The mash result +#' +#' @importFrom ashr get_pm +get_ncond = function(m){ + return(ncol(get_pm(m))) +} + + +#' Return the Bayes Factor for each effect +#' +#' @param m the mash result (from joint or 1by1 analysis); must have been +#' computed using usepointmass = TRUE +#' +#' @return if m was fitted using usepointmass=TRUE then returns a vector of +#' the log10(bf) values for each effect. That is, the jth element +#' lbf_j is log10(Pr(Bj | g = ghat-nonnull)/Pr(Bj | g = 0)) where gha +#' t-nonnull is the non-null part of ghat. Otherwise returns NULL. +#' +get_log10bf = function(m) { + if(is.null(m$null_loglik)){ + return(NULL) + } else { + return((m$alt_loglik - m$null_loglik)/log(10)) + } +} + + +#' From a mash result, get effects that are significant in at least +#' one condition. +#' +#' @param m the mash result (from joint or 1by1 analysis) +#' @param thresh indicates the threshold below which to call signals significant +#' @param conditions which conditions to include in check (default to all) +#' @param sig_fn the significance function used to extract significance from +#' mash object; eg could be ashr::get_lfsr or ashr::get_lfdr. (Small values +#' must indicate significant.) +#' +#' @return a vector containing the indices of the significant effects, by +#' order of most significant to least +#' +#' @importFrom ashr get_lfsr +#' +#' @export +get_significant_results = function(m, thresh = 0.05, conditions = NULL, + sig_fn = ashr::get_lfsr) { + if (is.null(conditions)) { + conditions = 1:get_ncond(m) + } + top = apply(sig_fn(m)[,conditions,drop=FALSE],1,min) # find top effect + # in each condition + sig = which(top < thresh) + ord = order(top[sig],decreasing=FALSE) + sig[ord] +} + + +#' @title Get mash marker_df +#' +#' @description Pulls SNP markers information in the mash object from a bigsnp +#' object. +#' +#' @param m An object of type mash +#' @param snp A bigSNP object, produced by the bigsnpr package. Here, the WAMI +#' SNP information. +#' +#' @importFrom magrittr %>% +#' @importFrom rlang .data +#' @importFrom tibble enframe tibble +#' @importFrom dplyr arrange row_number mutate left_join +#' +get_marker_df <- function(m, snp){ + if(attr(snp, "class") != "bigSNP"){ + stop("snp needs to be a bigSNP object, produced by the bigsnpr package.") + } + markers <- tibble(CHR = snp$map$chromosome, POS = snp$map$physical.pos, + marker.ID = snp$map$marker.ID) %>% + mutate(CHRN = as.numeric(as.factor(.data$CHR)), + CHR = as.factor(.data$CHR), + value = row_number()) + marker_df <- get_significant_results(m, thresh = 1) %>% + enframe(name = NULL, value = "value") %>% + arrange(.data$value) %>% + left_join(markers, by = "value") + + return(marker_df) +} + +#' @title Return the estimated mixture proportions. Use get_estimated_pi to +#' extract the estimates of the mixture proportions for different types of +#' covariance matrix. This tells you which covariance matrices have most of +#' the mass. +#' +#' @param m the mash result +#' @param dimension indicates whether you want the mixture proportions for the +#' covariances, grid, or all +#' +#' @return a named vector containing the estimated mixture proportions. +#' +#' @details If the fit was done with `usepointmass=TRUE` then the first +#' element of the returned vector will correspond to the null, and the +#' remaining elements to the non-null covariance matrices. Suppose the fit +#' was done with $K$ covariances and a grid of length $L$. If +#' `dimension=cov` then the returned vector will be of length $K$ +#' (or $K+1$ if `usepointmass=TRUE`). If `dimension=grid` then +#' the returned vector will be of length $L$ (or $L+1$). If +#' `dimension=all` then the returned vector will be of length $LK$ (or +#' $LK+1$). The names of the vector will be informative for which +#' combination each element corresponds to. +#' +#' @importFrom ashr get_fitted_g +#' +get_estimated_pi = function(m, dimension = c("cov","grid","all")){ + dimension = match.arg(dimension) + if(dimension=="all"){ + get_estimated_pi_no_collapse(m) + } else { + g = get_fitted_g(m) + pihat = g$pi + pihat_names = NULL + pi_null = NULL + + if(g$usepointmass){ + pihat_names=c("null",pihat_names) + pi_null = pihat[1] + pihat = pihat[-1] + } + + pihat = matrix(pihat,nrow=length(g$Ulist)) + if(dimension=="cov"){ + pihat = rowSums(pihat) + pihat_names = c(pihat_names,names(g$Ulist)) + } else if(dimension=="grid"){ + pihat = colSums(pihat) + pihat_names = c(pihat_names,1:length(g$grid)) + } + + pihat = c(pi_null,pihat) + names(pihat) = pihat_names + return(pihat) + } +} + +get_estimated_pi_no_collapse = function(m){ + g = get_fitted_g(m) + pihat = g$pi + names(pihat) = names(expand_cov(g$Ulist, g$grid, g$usepointmass)) + pihat +} + +#' @title Create expanded list of covariance matrices expanded by +#' grid, Sigma_{lk} = omega_l U_k +#' +#' @description This is an internal (non-exported) function. This help +#' page provides additional documentation mainly intended for +#' developers and expert users. +#' +#' @param Ulist a list of covarance matrices +#' +#' @param grid a grid of scalar values by which the covariance +#' matrices are to be sc +#' +#' @param usepointmass if TRUE adds a point mass at 0 (null component) +#' to the list +#' +#' @return This takes the covariance matrices in Ulist and multiplies +#' them by the grid values If usepointmass is TRUE then it adds a null +#' component. +#' +#' @keywords internal +#' +expand_cov = function(Ulist, grid, usepointmass = TRUE){ + scaled_Ulist = scale_cov(Ulist, grid) + R = nrow(Ulist[[1]]) + if(usepointmass){ + scaled_Ulist = c(list(null=matrix(0,nrow=R,ncol=R)),scaled_Ulist) + } + return(scaled_Ulist) +} + +#' @title Scale each covariance matrix in list Ulist by a scalar in +#' vector grid +#' +#' @description This is an internal (non-exported) function. This help +#' page provides additional documentation mainly intended for +#' developers and expert users. +#' +#' @param Ulist a list of matrices +#' +#' @param grid a vector of scaling factors (standard deviaions) +#' +#' @return a list with length length(Ulist)*length(grid) +#' +#' @keywords internal +#' +scale_cov = function(Ulist, grid){ + orig_names = names(Ulist) + Ulist = unlist( lapply(grid^2, function(x){multiply_list(Ulist,x)}), + recursive = FALSE) + names(Ulist) = unlist( lapply(1:length(grid), + function(x){paste0(orig_names,".",x)}), + recursive = FALSE) + return(Ulist) +} + +# Multiply each element of a list by scalar. (In our application each +# element of the list is a matrix.) +multiply_list = function(Ulist, x){lapply(Ulist, function(U){x*U})} + + +#' @title Get column names from a mash object +#' +#' @description This function extracts the column names from the local false +#' sign rate table of a mash object's results. This can tell you the condition +#' names or phenotype names used in the mash object. That can be useful for +#' looking at a subset of these columns, say. +#' +#' @param m An object of type mash +#' +#' @return A vector of phenotype names +#' +#' @examples +#' \dontrun{get_colnames(m = mash_obj)} +#' +get_colnames <- function(m){ + column_names <- colnames(m$result$lfsr) + return(column_names) +} + +get_Ulist <- function(m){ + Ulist <- m$fitted_g$Ulist + return(Ulist) +} + +#' Count number of conditions each effect is significant in +#' +#' @param m the mash result (from joint or 1by1 analysis) +#' @param thresh indicates the threshold below which to call signals significant +#' @param conditions which conditions to include in check (default to all) +#' @param sig_fn the significance function used to extract significance from mash object; eg could be ashr::get_lfsr or ashr::get_lfdr +#' +#' @return a vector containing the number of significant conditions +#' +get_n_significant_conditions = function(m, thresh = 0.05, conditions = NULL, + sig_fn = get_lfsr){ + if (is.null(conditions)) { + conditions = 1:get_ncond(m) + } + return(apply(sig_fn(m)[,conditions,drop=FALSE] < thresh, 1, sum)) +} + + +#' Compute the proportion of (significant) signals shared by magnitude in each pair of conditions, based on the poterior mean +#' +#' @param m the mash fit +#' @param factor a number between 0 and 1 - the factor within which effects are +#' considered to be shared. +#' @param lfsr_thresh the lfsr threshold for including an effect in the +#' assessment +#' @param FUN a function to be applied to the estimated effect sizes before +#' assessing sharing. The most obvious choice beside the default +#' 'FUN=identity' would be 'FUN=abs' if you want to ignore the sign of the +#' effects when assesing sharing. +#' @details For each pair of tissues, first identify the effects that are +#' significant (by lfsr factor & ratio < (1/factor)) + } + } + S[lower.tri(S, diag = FALSE)] = t(S)[lower.tri(S, diag = FALSE)] + colnames(S) = row.names(S) = colnames(m$result$PosteriorMean) + + return(S) +} + +#' From a mash result, get effects that are significant in at least one condition +#' +#' @param m the mash result (from joint or 1by1 analysis) +#' @param thresh indicates the threshold below which to call signals significant +#' @param conditions which conditions to include in check (default to all) +#' @param sig_fn the significance function used to extract significance from mash object; eg could be ashr::get_lfsr or ashr::get_lfdr. (Small values must indicate significant.) +#' +#' @return a vector containing the indices of the significant effects, by order of most significant to least +#' +#' @importFrom ashr get_lfsr +#' +#' @export +get_significant_results = function(m, thresh = 0.05, conditions = NULL, + sig_fn = ashr::get_lfsr) { + if (is.null(conditions)) { + conditions = 1:get_ncond(m) + } + top = apply(sig_fn(m)[,conditions, drop = FALSE], 1, min) # find top effect in each condition + sig = which(top < thresh) + ord = order(top[sig], decreasing = FALSE) + sig[ord] +} + +#' @title Get data frames of types of GxE from a mash object +#' +#' @description Performs set operations to determine pairwise GxE for effects +#' from a mash object. +#' +#' @param m An object of type mash +#' @param thresh Numeric. The threshold for including an effect in the assessment +#' @param factor a number between 0 and 1. The factor within which effects are +#' considered to be shared. +#' +#' @return A list containing eight data frames. Those with names that start +#' "S_" contain significant effects of different types between pairs of +#' named rows and columns. S_all_pairwise contains all significant effects; +#' NS_pairwise contains all non-significant effects. S_CN contains effects +#' significant in only one condition, and effects with a significantly +#' different magnitude (differential sensitivity). This dataframe is not +#' conservative using the local false sign rate test - we can't determine +#' the sign of one of the effects for effects significant in only one +#' condition - so it's not recommended to use this, but included. S_2_no +#' contains effects significant in both conditions that do not differ +#' significantly in magnitude. These effects do not have GxE. S_AP contains +#' effects significant in both conditions that differ in their sign - and +#' have antagonistic pleiotropy. S_DS contains effects significant in both +#' conditions that differ in the magnitude of their effect, but not their +#' sign - differentially sensitive alleles. S_1_row and S_1_col contain +#' effects that are significant in just one of the two conditions - the row +#' or the column, respectively. +#' +#' @importFrom dplyr between mutate filter +#' @importFrom tibble enframe +#' @importFrom magrittr %>% +#' @importFrom rlang .data +#' +#' @export +get_GxE = function(m, factor = 0.4, thresh = 0.05){ + R = get_ncond(m) # Effects to consider + + S_all = matrix(NA, nrow = R, ncol = R, dimnames = list(get_colnames(m), + get_colnames(m))) + S_2_no = matrix(NA, nrow = R, ncol = R, dimnames = list(get_colnames(m), + get_colnames(m))) + S_CN = matrix(NA, nrow = R, ncol = R, dimnames = list(get_colnames(m), + get_colnames(m))) + S_AP = matrix(NA, nrow = R, ncol = R, dimnames = list(get_colnames(m), + get_colnames(m))) + S_DS = matrix(NA, nrow = R, ncol = R, dimnames = list(get_colnames(m), + get_colnames(m))) + NS_pair = matrix(NA, nrow = R, ncol = R, dimnames = list(get_colnames(m), + get_colnames(m))) + S_i = matrix(NA, nrow = R, ncol = R, dimnames = list(get_colnames(m), + get_colnames(m))) + S_j = matrix(NA, nrow = R, ncol = R, dimnames = list(get_colnames(m), + get_colnames(m))) + + for(i in 1:R){ + for(j in 1:R){ + + if(i == j){ + S_all[i, j] = length(get_significant_results(m, thresh = thresh, + conditions = i)) + S_CN[i, j] = 0 + # Not conservative!! + S_2_no[i, j] = length(get_significant_results(m, thresh = thresh, + conditions = i)) + S_AP[i, j] = 0 + S_DS[i, j] = 0 + + sig_i = get_significant_results(m, thresh = thresh, conditions = i) + all_i = get_significant_results(m, thresh = 1, conditions = i) + ns_i = dplyr::setdiff(all_i, sig_i) # effects that aren't sig in i + NS_pair[i, j] = length(ns_i) + S_i[i, j] = 0 + S_j[i, j] = 0 + + } else { + + sig_i = get_significant_results(m, thresh = thresh, conditions = i) + sig_j = get_significant_results(m, thresh = thresh, conditions = j) + + all_i = get_significant_results(m, thresh = 1, conditions = i) + all_j = get_significant_results(m, thresh = 1, conditions = j) + + ns_i = setdiff(all_i, sig_i) # effects that aren't sig in i + ns_j = setdiff(all_j, sig_j) # effects that aren't sig in j + + # Markers where we aren't sure of the sign in either condition + # aka most of the effects + NS_pair[i,j] <- length(intersect(ns_i, ns_j)) + + # Markers where we are sure of the sign in just one condition + + # Markers significant in i but not in j + ms_isigi = intersect(sig_i, ns_j) + + # Markers significant in j but not in i + ms_jsigj = intersect(sig_j, ns_i) + + # Markers where we are sure of the sign in both conditions + effects_df <- get_pm(m)[union(sig_i, sig_j), i] %>% + enframe(name = "Marker", value = "Effect_i") %>% + mutate(Effect_j = get_pm(m)[union(sig_i, sig_j), j], + ratio = .data$Effect_i/.data$Effect_j, ##divide effect sizes, if this ratio is positive there is not AP + APratio = .data$Effect_i/-.data$Effect_j) ##divide effect sizes, if this ratio is positive there is AP + + ## GxE: we are sure of the sign for two effects, and they are the same sign + # No GxE in this pair - effects are same sign and same mag + ms_sig2_noGxE <- effects_df %>% + filter(between(.data$ratio, factor, 1/factor)) + + # DS: we are sure of the sign for two effects, and they are the same sign + ms_sig2_DS <- effects_df %>% + filter(.data$ratio > 0 & !between(.data$ratio, factor, 1/factor)) + + ## GxE we are sure of the sign for two effects, and they are opposite + # AP: we are sure of the sign for two effects, and they are opposite + ms_sig2_AP <- effects_df %>% + filter(between(.data$APratio, 0, 1E10)) + + S_all[i, j] = sum(length(ms_isigi), length(ms_jsigj), nrow(ms_sig2_noGxE), + nrow(ms_sig2_DS), nrow(ms_sig2_AP)) + S_CN[i, j] = sum(length(ms_isigi), length(ms_jsigj), nrow(ms_sig2_DS)) + # Not conservative!! + S_2_no[i, j] = sum(nrow(ms_sig2_noGxE)) + S_AP[i, j] = sum(nrow(ms_sig2_AP)) + S_DS[i, j] = sum(nrow(ms_sig2_DS)) + S_i[i, j] = length(ms_isigi) + S_j[i, j] = length(ms_jsigj) + } + } + } + return(list(S_all_pairwise = S_all, S_CN = S_CN, S_2_no = S_2_no, S_AP = S_AP, + S_DS = S_DS, NS_pairwise = NS_pair, S_1_row = S_i, + S_1_col = S_j)) +} + +#' @title Get the positions of objects in a mash object Ulist that are above +#' some mass threshold. +#' +#' @description Get the positions of objects in a mash object Ulist that are +#' above some mass threshold. +#' +#' @param m An object of type mash +#' @param thresh Numeric. The mass threshold for including a covariance matrix +#' +#' @export +get_U_by_mass <- function(m, thresh = 0.05){ + umass <- mash_plot_covar(m, saveoutput = FALSE) + mass_thresh <- + umass$covar_df$`Covariance Matrix`[which(umass$covar_df$Mass >= thresh)] + Ulist_order <- names(get_Ulist(m)) + range <- which(Ulist_order %in% mass_thresh) + return(range) +} + + +#' @title Reorder correlation matrix +#' +#' @description Reorder correlation coefficients from a matrix of things +#' (including NA's) and hierarchically cluster them +#' +#' @param cormat A correlation matrix +#' +#' @importFrom cluster daisy +#' @importFrom stats hclust +#' +reorder_cormat <- function(cormat){ + # Use correlation between variables as distance + dd <- daisy(cormat, metric = "gower") + hc <- hclust(dd) + cormat <- cormat[hc$order, hc$order] +} + + +# --- Plot & Save Plots --------- + +#' ggplot of single mash effect +#' +#' @description Creates a plot with point estimates and standard errors for +#' effects of a single SNP in multiple conditions. +#' +#' @param m An object of type mash +#' @param snp A bigSNP object, produced by the bigsnpr package. Here, +#' the WAMI SNP information. +#' @param n Optional. Integer or integer vector. The result number to plot, in +#' order of significance. 1 would be the top result, for example. Find +#' these with \code{\link{get_significant_results}}. +#' @param i Optional. Integer or integer vector. The result number to plot, in +#' the order of the mash object. 1 would be the first marker in the mash +#' object, for example. Find these with \code{\link{get_marker_df}}. +#' @param marker Optional. Print the marker name on the plot? +#' @param saveoutput Logical. Should the output be saved to the path? +#' @param suffix Character. Optional. A unique suffix used to save the files, +#' instead of the current date & time. +#' +#' @note Specify only one of n or i. +#' +#' @importFrom ashr get_psd +#' @importFrom cowplot save_plot +#' @importFrom tibble enframe +#' @importFrom dplyr mutate case_when +#' @importFrom tidyr separate +#' @import ggplot2 +#' @importFrom purrr as_vector +#' @importFrom stringr str_replace str_replace_all str_sub +#' +#' @export +mash_plot_marker_effect <- function(m, snp = snp, n = NA, i = NA, marker = TRUE, + saveoutput = FALSE, suffix = ""){ + stopifnot((typeof(n) %in% c("double", "integer") | typeof(i) %in% c("double", + "integer"))) + if(attr(snp, "class") != "bigSNP"){ + stop("snp needs to be a bigSNP object, produced by the bigsnpr package.") + } + if(typeof(n) != "logical"){ + i <- get_significant_results(m)[n] + marker_df <- tibble(CHR = snp$map$chromosome[i], + POS = snp$map$physical.pos[i], + marker.ID = snp$map$marker.ID[i]) + marker_name <- marker_df$marker.ID + } else { + marker_df <- get_marker_df(m, snp)[i,] + marker_name <- marker_df$marker.ID + } + effectplot <- get_colnames(m) %>% + enframe(name = NULL, value = "Conditions") %>% + mutate(mn = get_pm(m)[i,], + se = get_psd(m)[i,]) %>% + mutate(Conditions = str_replace(.data$Conditions, + "(Stand_)??Bhat_?", + ""), + Conditions = str_replace(.data$Conditions, "-mean$", "") + ) + + ggobject <- ggplot(data = effectplot) + + geom_point(mapping = aes(x = as.factor(.data$Conditions), y = .data$mn)) + + geom_errorbar(mapping = aes(ymin = .data$mn - .data$se, + ymax = .data$mn + .data$se, + x = .data$Conditions), width = 0.3) + + geom_hline(yintercept = 0, lty = 2) + + labs(x = "Conditions", y = "Effect Size") + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + if(marker == TRUE){ + ggobject <- ggobject + ggtitle(label = marker_name) + } + if(saveoutput == TRUE){ + if(!(str_sub(suffix, end = 1) %in% c("", "_"))){ + suffix <- paste0("_", suffix) + } + if(str_sub(suffix, end = 1) %in% c("")){ + suffix <- paste0("_", get_date_filename()) + } + save_plot(filename = paste0("Effect_plot_", str_replace_all(marker_name, + " ", "_"), + suffix, ".png"), + plot = ggobject, base_aspect_ratio = nrow(effectplot)*0.1 + 0.1, + base_height = 3.5) + } + return(list(marker = marker_name, effect_df = effectplot, + ggobject = ggobject)) +} + +#' ggplot of covariance matrix masses +#' +#' @description Creates a bar plot using ggplot of the masses that are on each +#' covariance matrix specified in the mash model. +#' +#' @param m An object of type mash +#' @param saveoutput Logical. Should the output be saved to the path? +#' @param suffix Character. Optional. A unique suffix used to save the files, +#' instead of the current date & time. +#' +#' @importFrom cowplot save_plot +#' @importFrom tibble enframe +#' @importFrom dplyr mutate arrange desc +#' @importFrom stringr str_replace str_sub +#' @import ggplot2 +#' +#' @note This plot can be useful for seeing the overall patterns of effects in +#' the data used in mash. Non-significant effects will add mass to the +#' "no_effects" covariance matrix, while significant effects will add mass +#' to one of the other covariance matrices. You can use mash_plot_Ulist() +#' to plot the covariance matrix patterns themselves. +#' +#' @export +mash_plot_covar <- function(m, saveoutput = FALSE, suffix = ""){ + df <- get_estimated_pi(m) + df <- enframe(df, name = "Covariance Matrix", value = "Mass") %>% + mutate(`Covariance Matrix` = str_replace(.data$`Covariance Matrix`, + "(Stand_)??Bhat_?", + "single_effect_"), + `Covariance Matrix` = str_replace(.data$`Covariance Matrix`, + "^null$", "no_effects")) %>% + arrange(desc(.data$`Covariance Matrix`)) + df$`Covariance Matrix` <- factor(df$`Covariance Matrix`, + levels = (df$`Covariance Matrix`)) + ggobject <- ggplot(df) + + geom_bar(aes(x = .data$Mass, y = .data$`Covariance Matrix`), + stat = "identity") + + if(saveoutput == TRUE){ + if(!(str_sub(suffix, end = 1) %in% c("", "_"))){ + suffix <- paste0("_", suffix) + } + if(str_sub(suffix, end = 1) %in% c("")){ + suffix <- paste0("_", get_date_filename()) + } + save_plot(paste0("Covariance_matrix_mass_plot", suffix, + ".png"), plot = ggobject, base_aspect_ratio = 1, + base_height = nrow(df)*.15) + } + return(list(covar_df = df, ggobject = ggobject)) +} + +#' ggplot of specific covariance matrix patterns +#' +#' @description Creates a tile plot using ggplot of the covariance matrices +#' specified in the mash model. +#' +#' @param m An object of type mash +#' @param range Numeric vector. Which covariance matrices should be plotted? +#' @param saveoutput Logical. Should the output be saved to the path? +#' @param suffix Character. Optional. A unique suffix used to save the files, +#' instead of the current date & time. +#' @param limits Logical. Should there be plot limits of -1 and 1? Default is TRUE. +#' @param labels Logical. Should each pairwise comparison proportion have a label? Default is TRUE. +#' +#' @return A list of dataframes used to make the tile plots and the plots +#' themselves. +#' +#' @importFrom cowplot save_plot +#' @importFrom tibble enframe as_tibble +#' @importFrom dplyr mutate filter +#' @importFrom rlist list.append +#' @importFrom rlang .data +#' @importFrom stringr str_replace str_sub +#' @importFrom tidyr pivot_longer +#' @import ggplot2 +#' +#' @export +mash_plot_Ulist <- function(m, range = NA, saveoutput = FALSE, suffix = "", + limits = TRUE, labels = TRUE){ + Ulist <- get_Ulist(m) + Ureturn <- list() + stopifnot(typeof(range) %in% c("double", "integer", "logical")) + if(typeof(range) == "logical"){ + range <- seq_along(Ulist) + } + + for(u in range){ + U1 <- Ulist[[u]] + # Remove half the tiles if the matrix is symmetric + if(isSymmetric(U1)){ + for(i in 1:nrow(U1)){ + for(j in 1:ncol(U1)){ + if(i < j){ + U1[i, j] <- NA + } + } + } + } + if(ncol(U1) == length(get_colnames(m))){ + colnames(U1) <- str_replace(get_colnames(m), "(Stand_)??Bhat_?", "") %>% + str_replace("-mean$", "") + U1 <- as_tibble(U1, .name_repair = "unique") %>% + mutate(rowU = str_replace(get_colnames(m), "(Stand_)??Bhat_?", ""), + rowU = str_replace(.data$rowU, "-mean$", "")) %>% + pivot_longer(cols = -.data$rowU, names_to = "colU", + values_to = "covar") %>% + filter(!is.na(.data$covar)) + } else { + U1 <- as_tibble(U1, rownames = "rowU", .name_repair = "unique") %>% + pivot_longer(cols = -.data$rowU, names_to = "colU", + values_to = "covar") %>% + mutate(rowU = paste0("Condition", .data$rowU), + colU = str_replace(.data$colU, "\\.\\.\\.", "Condition")) %>% + filter(!is.na(.data$covar)) + } + + if(limits == TRUE){ + U1_covar <- U1 %>% + ggplot(aes(x = .data$rowU, y = .data$colU)) + + geom_tile(aes(fill = .data$covar), na.rm = TRUE) + + scale_fill_gradientn(colors = c("#440154FF", "#3B528BFF", "#2C728EFF", + "white", "#27AD81FF", "#5DC863FF", + "#FDE725FF"), limits = c(-1,1)) + + #geom_text(aes(label = round(.data$covar, 1)), color = "darkgrey") + + theme(legend.position = "right", + axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + + xlab("") + ylab("") + ggtitle(names(Ulist)[u]) + } else { + lim_lower <- min(U1$covar) + lim_upper <- max(U1$covar) + max(U1$covar) * 0.01 + U1_covar <- U1 %>% + ggplot(aes(x = .data$rowU, y = .data$colU)) + + geom_tile(aes(fill = .data$covar), na.rm = TRUE) + + scale_fill_gradientn(colors = c("#440154FF", "#472D7BFF", "#3B528BFF", + "#2C728EFF", "#21908CFF", "#27AD81FF", + "#5DC863FF", "#AADC32FF", "#FDE725FF"), + limits = c(lim_lower, lim_upper)) + + #geom_text(aes(label = round(.data$covar, 1)), color = "darkgrey") + + theme(legend.position = "right", + axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + + xlab("") + ylab("") + ggtitle(names(Ulist)[u]) + } + if (labels == TRUE) { + U1_covar <- U1_covar + + geom_text(aes(label = round(.data$covar, 1)), color = "darkgrey") + } + + Ureturn <- list.append(Ureturn, U1 = U1, + ggobject = U1_covar) + names(Ureturn)[-2] <- paste0(names(Ulist)[u], "_df") + names(Ureturn)[-1] <- paste0(names(Ulist)[u], "_ggobject") + + if(saveoutput == TRUE){ + if(!(str_sub(suffix, end = 1) %in% c("", "_"))){ + suffix <- paste0("_", suffix) + } + if(str_sub(suffix, end = 1) %in% c("")){ + suffix <- paste0("_", get_date_filename()) + } + save_plot(paste0("Covariances_plot_", names(Ulist)[u], suffix, ".png"), + plot = U1_covar, + base_height = length(get_colnames(m))*.18 + 1.2, + base_asp = 1.35) + } + + } + return(Ureturn) +} + +#' @title Manhattan plot in ggplot colored by significant conditions +#' +#' @description Takes a mash object and, for some vector of phenotypes, returns +#' a Manhattan plot ggplot object (and its dataframe). Each SNP in the plot +#' is colored by the number of phenotypes it is significant for. Even and +#' odd chromosomes have different shapes for their SNPs, so that +#' chromosome identity can be determined. +#' +#' @param m A mash object (outputted by mash). +#' @param snp A bigSNP object, produced by the bigsnpr package. Here, +#' the WAMI SNP information. +#' @param cond A vector of phenotypes. Defaults to the names of each +#' column in the mash object. +#' @param saveoutput Logical. Should the output be saved to the path? +#' @param suffix Character. Optional. A unique suffix used to save the files, +#' instead of the current date & time. +#' @param thresh Numeric. The threshold used for the local false sign rate to +#' call significance in a condition. +#' +#' @return A \code{tbl_df()} of the data used to make the Manhattan plot, and a +#' ggplot object containing the Manhattan. +#' +#' @importFrom cowplot save_plot +#' @importFrom dplyr rename select arrange mutate left_join +#' @import ggplot2 +#' @importFrom tibble as_tibble rownames_to_column enframe +#' @importFrom tidyr separate +#' @importFrom stringr str_sub +#' +#' @examples +#' \dontrun{manhattan_out <- mash_ggman_by_condition(m = m, saveoutput = TRUE)} +#' +#' @export +mash_plot_manhattan_by_condition <- function(m, snp, cond = NA, + saveoutput = FALSE, + suffix = "", thresh = 0.05){ + if(attr(snp, "class") != "bigSNP"){ + stop("snp needs to be a bigSNP object, produced by the bigsnpr package.") + } + num_sig_in_cond <- c() + + if(is.na(cond)[1]){ + cond <- get_colnames(m = m) + } + + log10bf_df <- get_log10bf(m = m) %>% + as.data.frame() %>% + rownames_to_column(var = "value") %>% + mutate(value = as.integer(.data$value)) %>% + as_tibble() %>% + left_join(get_marker_df(m = m, snp = snp), by = "value") %>% + dplyr::rename(log10BayesFactor = .data$V1) %>% + dplyr::select(-.data$value) %>% + mutate(Num_Sig_Conditions = get_n_significant_conditions(m = m, + thresh = thresh, + conditions = cond) ) + + log10BF <- expression(paste("log"[10], plain("(Bayes Factor)"))) + + ggmanobject <- ggplot(data = log10bf_df, aes(x = .data$POS, + y = .data$log10BayesFactor)) + + geom_point(aes(color = .data$Num_Sig_Conditions, fill = .data$Num_Sig_Conditions, + shape = as.factor(.data$CHR))) + + facet_wrap(~ .data$CHR, nrow = 1, scales = "free_x", strip.position = "bottom") + + scale_color_viridis_c(option = "B", end = 0.95) + + scale_fill_viridis_c(option = "B", end = 0.95) + + theme(strip.text = element_text(size = 8), + axis.text.y = element_text(size = 8), + axis.text.x = element_blank(), + legend.text = element_text(size = 8), + axis.ticks.x = element_blank(), + panel.background = element_rect(fill=NA), + legend.position = "right") + + labs(x = "Chromosome", y = log10BF) + + scale_x_continuous(expand = c(0.32, 0.32)) + + scale_shape_manual(values = rep(c(21, 22, 23), 9), guide = FALSE) + + if(saveoutput == TRUE){ + if(!(str_sub(suffix, end = 1) %in% c("", "_"))){ + suffix <- paste0("_", suffix) + } + if(str_sub(suffix, end = 1) %in% c("")){ + suffix <- paste0("_", get_date_filename()) + } + save_plot(paste0("Manhattan_mash", suffix, + ".png"), plot = ggmanobject, base_aspect_ratio = 2.6, + base_height = 3.3) + } + + return(list(ggman_df = log10bf_df, ggmanobject = ggmanobject)) +} + + +#' @title Create a ggplot of pairwise sharing of mash effects +#' +#' @description Given a correlation matrix, an RDS with a correlation matrix, or +#' a mash object, create a ggplot of pairwise sharing of mash effects using +#' \code{\link{get_pairwise_sharing}} and \code{\link{ggcorr}}. +#' +#' @param m An object of type mash +#' @param effectRDS An RDS containing a correlation matrix. +#' @param corrmatrix A correlation matrix +#' @param reorder Logical. Should the columns be reordered by similarity? +#' @param saveoutput Logical. Should the output be saved to the path? +#' @param suffix Character. Optional. A unique suffix used to save the files, +#' instead of the current date & time. +#' @param filename Character string with an output filename. Optional. +#' @param ... Other arguments to \code{\link{get_pairwise_sharing}} or +#' \code{\link{ggcorr}}. +#' +#' @importFrom GGally ggcorr +#' @importFrom stringr str_replace_all str_sub +#' @importFrom rlang .data +#' +#' @return A list containing a dataframe containing the correlations and a +#' ggplot2 object containing the correlation plot. +#' +#' @export +mash_plot_pairwise_sharing <- function(m = NULL, effectRDS = NULL, + corrmatrix = NULL, reorder = TRUE, + saveoutput = FALSE, filename = NA, + suffix = "", ...){ + # Additional arguments for get_pairwise_sharing, ggcorr, and save_plot + requireNamespace("dots") + factor <- dots::dots(name = 'factor', value = 0.5, ...) + lfsr_thresh <- dots::dots(name = 'lfsr_thresh', value = 0.05, ...) + FUN <- dots::dots(name = 'FUN', value = identity, ...) + geom <- dots::dots(name = 'geom', value = 'circle', ...) + label <- dots::dots(name = 'label', value = FALSE, ...) + label_alpha <- dots::dots(name = 'label_alpha', value = TRUE, ...) + label_size <- dots::dots(name = 'label_size', value = 3, ...) + hjust <- dots::dots(name = 'hjust', value = 0.95, ...) + vjust <- dots::dots(name = 'vjust', value = 0.3, ...) + layout.exp <- dots::dots(name = 'layout.exp', value = 9, ...) + min_size <- dots::dots(name = 'min_size', value = 0, ...) + max_size <- dots::dots(name = 'max_size', value = 3.5, ...) + option <- dots::dots(name = 'option', value = 'B', ...) + dpi <- dots::dots(name = 'dpi', value = 500, ...) + + base_aspect_ratio <- dots::dots(name = 'base_aspect_ratio', value = 1.1, ...) + + if(is.na(filename)[1]){ + if(!(str_sub(suffix, end = 1) %in% c("", "_"))){ + suffix <- paste0("_", suffix) + } + if(str_sub(suffix, end = 1) %in% c("")){ + suffix <- paste0("_", get_date_filename()) + } + filename <- paste0("Mash_pairwise_shared_effects", suffix, ".png") + } + + # look for a shared effects matrix in the path, and if not, generate one + if(!is.null(effectRDS) && is.null(m) && is.null(corrmatrix)){ + shared_effects <- readRDS(effectRDS) + } else if(!is.null(corrmatrix) && is.null(effectRDS) && is.null(m)){ + shared_effects <- corrmatrix + } else if(!is.null(m)){ + shared_effects <- get_pairwise_sharing(m = m, factor = factor, + lfsr_thresh = lfsr_thresh, FUN = FUN) + rownames(shared_effects) <- str_replace_all(rownames(shared_effects), + "(Stand_)??Bhat_?", "") %>% + str_replace("-mean$", "") + colnames(shared_effects) <- str_replace_all(colnames(shared_effects), + "(Stand_)??Bhat_?", "") %>% + str_replace("-mean$", "") + } else { + stop(paste0("Please specify one of these: ", + "1. a mash output object (m), ", + "2. the path to a effect rds file (mashRDS), ", + "3. a correlation matrix (corrmatrix).")) + } + + base_height <- dots::dots(name = 'base_height', + value = nrow(shared_effects)*0.33+1, ...) + + if(reorder == TRUE){ + corrdf <- reorder_cormat(cormat = shared_effects) + corrplot <- ggcorr(data = NULL, cor_matrix = corrdf, geom = geom, + label = label, label_alpha = label_alpha, + label_size = label_size, hjust = hjust, vjust = vjust, + layout.exp = layout.exp, min_size = min_size, + max_size = max_size) + + scale_color_viridis_c(option = option) + } else { + corrplot <- ggcorr(data = NULL, cor_matrix = shared_effects, geom = geom, + label = label, label_alpha = label_alpha, + label_size = label_size, hjust = hjust, vjust = vjust, + layout.exp = layout.exp, min_size = min_size, + max_size = max_size) + + scale_color_viridis_c(option = option) + } + + if(saveoutput == TRUE){ + save_plot(filename = filename, corrplot, + base_aspect_ratio = base_aspect_ratio, base_height = base_height, + dpi = dpi) + } + return(list(corr_matrix = shared_effects, gg_corr = corrplot)) +} + +#' @title Significant SNPs per number of conditions +#' +#' @description For some number of columns in a mash object that correspond to +#' conditions, find the number of SNPs that are significant for that number +#' of conditions. +#' +#' @param m An object of type mash +#' @param conditions A vector of conditions. Get these with get_colnames(m). +#' @param saveoutput Logical. Save plot output to a file? Default is FALSE. +#' @param thresh What is the threshold to call an effect significant? Default +#' is 0.05. +#' @param suffix Character. Optional. A unique suffix used to save the files, +#' instead of the current date & time. +#' +#' @return A list containing a dataframe of the number of SNPs significant per +#' number of conditions, and a ggplot object using that dataframe. +#' +#' @import ggplot2 +#' @importFrom tibble enframe +#' @importFrom dplyr rename summarise filter group_by n +#' @importFrom stringr str_sub +#' +#' @examples +#' \dontrun{mash_plot_sig_by_condition(m = mash_obj, saveoutput = TRUE)} +#' +#' @export +mash_plot_sig_by_condition <- function(m, conditions = NA, saveoutput = FALSE, + suffix = "", thresh = 0.05){ + + thresh <- as.numeric(thresh) + num_sig_in_cond <- c() + + if(typeof(conditions) == "logical"){ + cond <- get_colnames(m) + } else { + cond <- conditions + } + + SigHist <- get_n_significant_conditions(m = m, thresh = thresh, + conditions = cond) %>% + enframe(name = "Marker") %>% + rename(Number_of_Conditions = .data$value) %>% + group_by(.data$Number_of_Conditions) %>% + summarise(Significant_SNPs = n()) %>% + filter(.data$Number_of_Conditions != 0) + + vis <- ggplot(SigHist, aes(x = .data$Number_of_Conditions, + y = .data$Significant_SNPs)) + + geom_line() + + geom_point() + + geom_hline(yintercept = 0, lty = 2) + + xlab(label = "Number of Conditions") + + ylab(label = "Number of Significant SNPs") + + if(saveoutput == TRUE){ + if(!(str_sub(suffix, end = 1) %in% c("", "_"))){ + suffix <- paste0("_", suffix) + } + if(str_sub(suffix, end = 1) %in% c("")){ + suffix <- paste0("_", get_date_filename()) + } + ggsave(paste0("SNPs_significant_by_number_of_conditions", suffix, ".png"), + width = 5, height = 3, units = "in", dpi = 400) + } + + return(list(sighist = SigHist, ggobject = vis)) +} diff --git a/man/check_covmat_basics.Rd b/man/check_covmat_basics.Rd new file mode 100644 index 0000000..9cc75e1 --- /dev/null +++ b/man/check_covmat_basics.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wrapper.R +\name{check_covmat_basics} +\alias{check_covmat_basics} +\title{Basic sanity check for covariance matrices} +\usage{ +check_covmat_basics(x) +} +\arguments{ +\item{x}{input matrix} +} +\description{ +Basic sanity check for covariance matrices +} diff --git a/man/check_positive_definite.Rd b/man/check_positive_definite.Rd new file mode 100644 index 0000000..fb655c8 --- /dev/null +++ b/man/check_positive_definite.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wrapper.R +\name{check_positive_definite} +\alias{check_positive_definite} +\title{check matrix for positive definitness} +\usage{ +check_positive_definite(x) +} +\arguments{ +\item{x}{input matrix} +} +\description{ +check matrix for positive definitness +} diff --git a/man/labelled_stop.Rd b/man/labelled_stop.Rd new file mode 100644 index 0000000..4484e4a --- /dev/null +++ b/man/labelled_stop.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wrapper.R +\name{labelled_stop} +\alias{labelled_stop} +\title{A wrapper function to \code{stop} call} +\usage{ +labelled_stop(x, msg) +} +\arguments{ +\item{x}{input matrix} + +\item{msg}{Character string. A message to append to the stop call.} +} +\description{ +A wrapper function to \code{stop} call +} diff --git a/man/mash_plot_Ulist.Rd b/man/mash_plot_Ulist.Rd index 56d6e91..b35a7ad 100644 --- a/man/mash_plot_Ulist.Rd +++ b/man/mash_plot_Ulist.Rd @@ -23,7 +23,9 @@ mash_plot_Ulist( \item{suffix}{Character. Optional. A unique suffix used to save the files, instead of the current date & time.} -\item{limits}{should there be plot limits of -1 and 1? Default is true.} +\item{limits}{Logical. Should there be plot limits of -1 and 1? Default is TRUE.} + +\item{labels}{Logical. Should each pairwise comparison proportion have a label? Default is TRUE.} } \value{ A list of dataframes used to make the tile plots and the plots diff --git a/man/printf2.Rd b/man/printf2.Rd index 11e288c..fe4b96a 100644 --- a/man/printf2.Rd +++ b/man/printf2.Rd @@ -2,10 +2,15 @@ % Please edit documentation in R/wrapper.R \name{printf2} \alias{printf2} -\title{Verbose?} +\title{print message function if verbose} \usage{ printf2(verbose, ...) } -\description{ -Verbose? +\arguments{ +\item{verbose}{Logical. If TRUE, print progress messages.} + +\item{...}{Other arguments to \code{printf()}} +} +\description{ +print message function if verbose }