Error checks and plotting speedups for dive_phe2mash function

2021-03-31 18:17:02 -05:00
parent 027323acf6
commit 1edd72d15e
26 changed files with 779 additions and 63 deletions
--- a/R/wrapper.R
+++ b/R/wrapper.R
@@ -42,6 +42,7 @@
 #' @param U.hyp Other covariance matrices for mash. Specify these as a list. These
 #'     matrices must have dimensions that match the number of phenotypes where
 #'     univariate GWAS ran successfully.
+#' @param verbose Output some information on the iterations? Default is `TRUE`.
 #'
 #' @return A mash object made up of all phenotypes where univariate GWAS ran
 #'     successfully.
@@ -56,6 +57,7 @@
 #' @importFrom tidyr replace_na
 #' @importFrom matrixStats colMaxs rowMaxs
 #' @importFrom stats predict
+#' @importFrom bigassertr printf
 #'
 #' @export
 dive_phe2mash <- function(df, snp, type = "linear", svd = NULL, suffix = "",
@@ -64,7 +66,7 @@ dive_phe2mash <- function(df, snp, type = "linear", svd = NULL, suffix = "",
                          thr.m = c("sum", "max"), num.strong = 1000,
                          num.random = NA,
                          scale.phe = TRUE, roll.size = 50, U.ed = NA,
-                          U.hyp = NA){
+                          U.hyp = NA, verbose = TRUE){
  # 1. Stop if not functions. ----
  if (attr(snp, "class") != "bigSNP") {
    stop("snp needs to be a bigSNP object, produced by the bigsnpr package.")
@@ -80,6 +82,9 @@ dive_phe2mash <- function(df, snp, type = "linear", svd = NULL, suffix = "",
  } else {
    type <- rep(type, ncol(df) - 1)
  }
+  if (!dir.exists(outputdir)) {
+    dir.create(outputdir)
+  }

  ## 1a. Generate useful values ----
  G <- snp$genotypes
@@ -94,12 +99,13 @@ dive_phe2mash <- function(df, snp, type = "linear", svd = NULL, suffix = "",
  bonferroni <- -log10(0.05/length(snp$map$physical.pos))
  markers <- tibble(CHR = snp$map$chromosome, POS = snp$map$physical.pos,
                    marker.ID = snp$map$marker.ID) %>%
-    mutate(CHRN = as.numeric(as.factor(.data$CHR)))
+    mutate(CHRN = as.numeric(as.factor(.data$CHR)),
+           CHR = as.factor(.data$CHR))

  # 2. Pop Structure Correction  ----
  if (is.null(svd)) {
-    message(paste0("Covariance matrix (svd) was not supplied - this will be",
-                   " generated using snp_autoSVD()."))
+    printf2(verbose = verbose, "\nCovariance matrix (svd) was not supplied - ")
+    printf2(verbose = verbose, "\nthis will be generated using snp_autoSVD()")
    svd <- snp_autoSVD(G = G, infos.chr = markers$CHRN, infos.pos = markers$POS,
                       k = 10, thr.r2 = thr.r2, roll.size = roll.size)
    } else {
@@ -107,6 +113,7 @@ dive_phe2mash <- function(df, snp, type = "linear", svd = NULL, suffix = "",
      }
  pc_max <- ncol(svd$u)
  gwas_ok <- c()
+  first_gwas_ok <- FALSE

  for (i in 2:ncol(df)) {
    df1 <- df %>%
@@ -127,8 +134,10 @@ dive_phe2mash <- function(df, snp, type = "linear", svd = NULL, suffix = "",
    gwas_ok[i-1] <- check_gwas(df1 = df1, phename = phename, type = type[i-1],
                               nPhe = nPhe, minphe = min.phe, nLev = nLev)

+
    # Find best # PCs to correct for population structure for each phenotype.
    if(gwas_ok[i-1]){
+
      lambdagc_df <- div_lambda_GC(df = df1, type = type[i-1], snp = snp,
                                   svd = svd, npcs = c(0:pc_max))
      PC_df <- get_best_PC_df(lambdagc_df)
@@ -145,29 +154,22 @@ dive_phe2mash <- function(df, snp, type = "linear", svd = NULL, suffix = "",
                        nsnp = nSNP, npcs = PC_df$NumPCs, nphe = nPhe,
                        nlev = nLev, lambda_GC = PC_df$lambda_GC,
                        bonferroni = bonferroni)
-    # plot Manhattan and QQ if save.plots == TRUE
-    if(save.plots == TRUE){
+
+    # plot  QQ if save.plots == TRUE
+    if (save.plots == TRUE) {
      qqplot <- get_qqplot(ps = gwas$pvalue, lambdaGC = TRUE)
-      manhattan <- get_manhattan(log10p = gwas$log10p, snp = snp,
-                                 thresh = bonferroni)  # could round these too
-      plotname <- paste0(gwas_data$phe, "_", gwas_data$type, "_model_",
-                         gwas_data$nphe, "g_", gwas_data$nsnp, "_SNPs_",
-                         gwas_data$npcs, "_PCs.png")
-      save_plot(filename = file.path(outputdir, paste0("QQplot_", plotname)),
-                plot = qqplot, base_asp = 1, base_height = 4)
-      save_plot(filename = file.path(outputdir, paste0("Manhattan_", plotname)),
-                plot = manhattan, base_asp = 2.1, base_height = 3.5)
-    }
+      }

    # save gwas outputs together in a fbm
    gwas <- gwas %>%
      select(.data[["estim"]], .data[["std.err"]], .data[["log10p"]])

-    if(i == 2){     # save .bk and .rds file the first time through the loop.
+    if(!first_gwas_ok){     # save .bk and .rds file the first time through the loop.
      if (!grepl("_$", suffix) & suffix != ""){
        suffix <- paste0("_", suffix)
      }
-      fbm.name <- paste0(outputdir, "gwas_effects", suffix)
+      first_gwas_ok <- TRUE
+      fbm.name <- file.path(outputdir, paste0("gwas_effects", suffix))

      colnames_fbm <- c(paste0(phename, "_Effect"), paste0(phename, "_SE"),
                       paste0(phename, "_log10p"))
@@ -179,18 +181,43 @@ dive_phe2mash <- function(df, snp, type = "linear", svd = NULL, suffix = "",
      colnames_fbm <- c(colnames_fbm, paste0(phename, "_Effect"),
                        paste0(phename, "_SE"), paste0(phename, "_log10p"))
      gwas2$add_columns(ncol_add = 3)
-      gwas2[,c(i*3-5, i*3-4, i*3-3)] <- gwas
+      gwas2[, c(sum(gwas_ok)*3 - 2, sum(gwas_ok)*3 - 1,
+                sum(gwas_ok)*3)] <- gwas
      gwas2$save()
-      gwas_metadata <- add_row(gwas_metadata, phe = phename, type = type[i-1],
+      gwas_metadata <- add_row(gwas_metadata, phe = phename, type = type[i - 1],
                           nsnp = nSNP, npcs = PC_df$NumPCs, nphe = nPhe,
                           nlev = nLev, lambda_GC = PC_df$lambda_GC,
                           bonferroni = bonferroni)
    }
+    # plot Manhattan and QQ if save.plots == TRUE
+    if (save.plots == TRUE) {
+      # set aspect ratio based on number of SNPs in snp file
+      asp <- log10(snp$genotypes$ncol)/2
+      if(asp < 1.1){
+        asp <- 1.1
+      }
+
+      manhattan <- get_manhattan(X = gwas2, ind = sum(gwas_ok)*3, snp = snp,
+                                 thresh = bonferroni)
+      plotname <- paste0(gwas_data$phe, "_", gwas_data$type, "_model_",
+                         gwas_data$nphe, "g_", gwas_data$nsnp, "_SNPs_",
+                         gwas_data$npcs, "_PCs.png")
+      save_plot(filename = file.path(outputdir, paste0("QQplot_", plotname)),
+                plot = qqplot, base_asp = 1, base_height = 4)
+      save_plot(filename = file.path(outputdir, paste0("Manhattan_", plotname)),
+                plot = manhattan, base_asp = asp, base_height = 3.75)
+
+    }
  rm(gwas)
-  message(paste0("Finished phenotype ", i-1, ": ", names(df)[i]))
+  printf2(verbose = verbose, "\nFinished GWAS on phenotype %s. ",
+          names(df)[i])
+    } else {
+      printf2(verbose = verbose, "\nSkipping GWAS on phenotype %s. ",
+              names(df)[i])
    }
  }

+  printf2(verbose = verbose, "\nNow preparing gwas effects for use in mash.\n")
  # 4. mash input ----
      ## prioritize effects with max(log10p) or max(sum(log10p))
      ## make a random set of relatively unlinked SNPs
@@ -198,7 +225,7 @@ dive_phe2mash <- function(df, snp, type = "linear", svd = NULL, suffix = "",
  ind_se <- (1:sum(gwas_ok))*3 - 1
  ind_p <- (1:sum(gwas_ok))*3

-  if(thr.m == "sum"){
+  if (thr.m == "sum") {
  thr_log10p <- big_apply(gwas2,
                         a.FUN = function(X, ind) rowSums(X[, ind]),
                         ind = ind_p,
@@ -211,21 +238,21 @@ dive_phe2mash <- function(df, snp, type = "linear", svd = NULL, suffix = "",
  }
  gwas2$add_columns(ncol_add = 1)
  colnames_fbm <- c(colnames_fbm, paste0(thr.m, "_thr_log10p"))
-  gwas2[,(sum(gwas_ok)*3+1)] <- thr_log10p
+  gwas2[,(sum(gwas_ok)*3 + 1)] <- thr_log10p
  gwas2$save()

    ## replace NA or Nan values
  # Replace SE with 1's, estimates and p values with 0's.
  replace_na_1 <- function(X, ind) replace_na(X[, ind], 1)
  replace_na_0 <- function(X, ind) replace_na(X[, ind], 0)
-  gwas2[,ind_se] <- big_apply(gwas2, a.FUN = replace_na_1, ind = ind_se,
+  gwas2[, ind_se] <- big_apply(gwas2, a.FUN = replace_na_1, ind = ind_se,
                              a.combine = 'plus')
-  gwas2[,ind_estim] <- big_apply(gwas2, a.FUN = replace_na_0, ind = ind_estim,
+  gwas2[, ind_estim] <- big_apply(gwas2, a.FUN = replace_na_0, ind = ind_estim,
                                 a.combine = 'plus')
-  gwas2[,ind_p] <- big_apply(gwas2, a.FUN = replace_na_0, ind = ind_p,
+  gwas2[, ind_p] <- big_apply(gwas2, a.FUN = replace_na_0, ind = ind_p,
                                 a.combine = 'plus')
-  gwas2[,(sum(gwas_ok)*3+1)] <- big_apply(gwas2, a.FUN = replace_na_0,
-                                          ind = (sum(gwas_ok)*3+1),
+  gwas2[, (sum(gwas_ok)*3+1)] <- big_apply(gwas2, a.FUN = replace_na_0,
+                                          ind = (sum(gwas_ok)*3 + 1),
                                          a.combine = 'plus')
  gwas2$save()

@@ -301,31 +328,26 @@ dive_phe2mash <- function(df, snp, type = "linear", svd = NULL, suffix = "",
  # 5. mash ----

  data_r <- mashr::mash_set_data(Bhat_random, Shat_random)
-  message(paste0("Estimating the correlation structure in the null tests from ",
-                 "the random data.
-                 (not the strong data because it will not necessarily contain
-                 any null tests)."))
+  printf2(verbose = verbose, "\nEstimating correlation structure in the null tests from a random sample of clumped data.")
  Vhat <- mashr::estimate_null_correlation_simple(data = data_r)

-  message(paste0("Setting up the main data objects with this correlation ",
-                 "structure in place."))
  data_strong <- mashr::mash_set_data(Bhat_strong, Shat_strong, V = Vhat)
  data_random <- mashr::mash_set_data(Bhat_random, Shat_random, V = Vhat)
  data_full <- mashr::mash_set_data(Bhat_full, Shat_full, V = Vhat)
  U_c <- mashr::cov_canonical(data_random)

-  if (!is.na(U.ed[1])) {
-  message(paste0("Now estimating data-driven covariances using the strong",
-                 " tests.
-                 NB: This step may take some time to complete."))
-  if (length(ind_p) < 6) {
-    cov_npc <- ind_p - 1
-  } else {
-    cov_npc <- 5
-  }
-  U_pca = mashr::cov_pca(data_strong, npc = cov_npc)
-  U_ed = mashr::cov_ed(data_strong, U_pca)
-  saveRDS(U_ed, file = paste0(outputdir, "Mash_U_ed", suffix, ".rds"))
+  if (is.na(U.ed[1])) {
+    printf2(verbose = verbose, "\nNow estimating data-driven covariances using
+    the strong tests. NB: This step may take some time to complete.\n")
+    if (length(ind_p) < 6) {
+      cov_npc <- ind_p - 1
+    } else {
+      cov_npc <- 5
+    }
+    U_pca = mashr::cov_pca(data_strong, npc = cov_npc)
+    U_ed = mashr::cov_ed(data_strong, U_pca)
+    saveRDS(U_ed, file = file.path(outputdir, paste0("Mash_U_ed", suffix,
+                                                     ".rds")))
  } else if (typeof(U.ed) == "list") {
    U_ed <- U.ed
  } else if (typeof(U.ed) == "character") {
@@ -337,12 +359,15 @@ dive_phe2mash <- function(df, snp, type = "linear", svd = NULL, suffix = "",

  if (typeof(U.hyp) == "list") {
    m = mashr::mash(data_random, Ulist = c(U_ed, U_c, U.hyp), outputlevel = 1)
-  } else {
+    } else if (typeof(U.hyp) == "character") {
+    U_hyp <- readRDS(file = U.hyp)
+    m = mashr::mash(data_random, Ulist = c(U_ed, U_c, U_hyp), outputlevel = 1)
+    } else {
    m = mashr::mash(data_random, Ulist = c(U_ed, U_c), outputlevel = 1)
+    printf2(verbose = verbose, "\nNo user-specified covariance matrices were included in the mash fit.")
  }
-  message(paste0("Compute posterior matrices for all effects",
-                 " using the mash fit from the
-               random tests."))
+  printf2(verbose = verbose, "\nComputing posterior weights for all effects
+  using the mash fit from the random tests.")
  m2 = mashr::mash(data_full, g = ashr::get_fitted_g(m), fixg = TRUE)

  return(m2)
@@ -429,6 +454,9 @@ div_gwas <- function(df, snp, type, svd, npcs){
  return(gwaspc)
 }

+#' Verbose?
+#' @importFrom bigassertr printf
+printf2 <- function(verbose, ...) if (verbose) { printf(...) }

 #' Create a quantile-quantile plot with ggplot2.
 #'
@@ -544,17 +572,27 @@ round_xy <- function(x, y, cl = NA, cu = NA, roundby = 0.001){
  }
 }

-get_manhattan <- function(log10p, snp, thresh){
+get_manhattan <- function(X, ind, snp, thresh){
+  roundFBM <- function(X, ind, at) ceiling(X[, ind] / at) * at
+  observed <- big_apply(X, ind = ind, a.FUN = roundFBM, at = 0.01,
+                        a.combine = 'plus')
+
  plot_data <- tibble(CHR = snp$map$chromosome, POS = snp$map$physical.pos,
-                    marker.ID = snp$map$marker.ID, log10p = log10p) %>%
-    mutate(observed = round2(.data$log10p, at = 0.001)) %>%
+                    marker.ID = snp$map$marker.ID, observed = observed)
+
+  if (length(unique(snp$map$physical.pos)) >= 500000) {
+    plot_data <- plot_data %>%
+      mutate(POS = round2(.data$POS, at = 250000))
+  }
+  plot_data <- plot_data %>%
    group_by(.data$CHR, .data$POS, .data$observed) %>%
-    slice(1)
+    slice(1) %>%
+    mutate(CHR = as.factor(.data$CHR))

  nchr <- length(unique(plot_data$CHR))

  p1 <- plot_data %>%
-    ggplot(aes(x = .data$POS, y = .data$log10p)) +
+    ggplot(aes(x = .data$POS, y = .data$observed)) +
    geom_point(aes(color = .data$CHR, fill = .data$CHR)) +
    geom_hline(yintercept = thresh, color = "black", linetype = 2,
               size = 1) +
@@ -583,7 +621,7 @@ get_manhattan <- function(log10p, snp, thresh){
          strip.text = element_text(hjust = 0.5, size = 10 ,vjust = 0),
          strip.placement = 'outside', panel.spacing.x = unit(-0.1, 'cm')) +
    labs(x = "Chromosome", y = "-log10(p value)") +
-    scale_x_continuous(expand = c(0.2, 0.2))
+    scale_x_continuous(expand = c(0.15, 0.15))
  return(p1)
 }

@@ -737,10 +775,9 @@ get_lambdagc <- function(ps, tol = 1e-8){
 }


-#' Return best number of PCs in terms of lambda_GC for Panicum virgatum.
-#' Return best number of PCs in terms of lambda_GC for the CDBN.
+#' Return best number of PCs in terms of lambda_GC
 #'
-#' @description Given a dataframe created using pvdiv_lambda_GC, this function
+#' @description Given a dataframe created using div_lambda_GC, this function
 #'     returns the first lambda_GC less than 1.05, or the smallest lambda_GC,
 #'     for each column in the dataframe.
 #'
@@ -812,3 +849,35 @@ check_gwas <- function(df1, phename, type, nPhe, minphe, nLev){
  }
  return(gwas_ok)
 }
+
+
+## @title Basic sanity check for covariance matrices
+## @param X input matrix
+check_covmat_basics = function(x) {
+  label = substitute(x)
+  if (!is.matrix(x))
+    labelled_stop(label, "is not a matrix")
+  if (!is.numeric(x))
+    labelled_stop(label, "is not a numeric matrix")
+  if (any(is.na(x)))
+    labelled_stop(label, "cannot contain NA values")
+  if (any(is.infinite(x)))
+    labelled_stop(label, "cannot contain Inf values")
+  if (any(is.nan(x)))
+    labelled_stop(label, "cannot contain NaN values")
+  if (nrow(x) != ncol(x))
+    labelled_stop(label, "is not a square matrix")
+  if (!isSymmetric(x, check.attributes = FALSE))
+    labelled_stop(label, "is not a symmetric matrix")
+  return(TRUE)
+}
+
+## @title check matrix for positive definitness
+## @param X input matrix
+check_positive_definite = function(x) {
+  check_covmat_basics(x)
+  tryCatch(chol(x),
+           error = function(e) labelled_stop(substitute(x),
+                                             "must be positive definite"))
+  return(TRUE)
+}