view R_functions/pipeline_code.R @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
line wrap: on
line source

### pipeline ###
# Complete a single iteration, which consists of:
# - Doing pca
# - Clustering, if required
# - Finding outliers
# - Setting up plots
# Outputs a list containing all the data regarding this iteration
single_iteration = function(outdir, basename, ped_data, xsamples, numsds,
                           cmethod, tmethod, control_tag, cases_tag, ethnicity_data=NULL) {
  it_data = list()
  it_data$old_xsamples = xsamples
  # get data and do pca
  pca_data = do_pca(ped_data)
  it_data$pca_data = pca_data
  it_data$plots = list()
  plot_number = 1
  
  # plot controls and cases
  if (!is.null(control_tag) || !is.null(cases_tag)) {
    it_data$plots[[plot_number]] = setup_cvc_plot(pca_data, control_tag, cases_tag)
    plot_number = plot_number + 1
  }
  
  # if we have ethnicity data, setup a special plot
  if (!is.null(ethnicity_data)) {
    it_data$plots[[plot_number]] = setup_ethnicity_plot(pca_data, ethnicity_data)
    plot_number = plot_number + 1
  } 
  
  if (cmethod == "none") {
    # get outliers by sd
    all_outliers = outliers_by_sd(pca_data, xsamples, numsds)
    new_xsamples = union(xsamples, pca_data$ids[all_outliers])
    it_data$xsamples = new_xsamples
    # prepare outlier plot
    it_data$plots[[plot_number]] = setup_ol_plot(pca_data, all_outliers)
    plot_number = plot_number + 1
    # prepare sd plot
    it_data$plots[[plot_number]] = setup_sd_plot(pca_data)
    plot_number = plot_number + 1
  } else {
    # do clustering
    if (cmethod == "dbscan") {
      emax = 2
      mp = 7
      clusters = automated_dbscan(pca_data, emax, mp)
    } else if (cmethod == "hclust") {
      clusters = automated_hclust(pca_data)
    } else {
      clusters = automated_hclust(pca_data)
    }
    
    # get outliers
    cluster_outliers = which(clusters == 0)
    # get rejected clusters
    centers = find_cluster_centers(clusters, pca_data$values)
    if (tmethod == "mcd") {
      rc = find_cluster_outliers_mcd(clusters, centers, pca_data, numsds, 2)
    } else if (tmethod == "sd") {
      rc = find_cluster_outliers_sd(clusters, centers, pca_data, numsds)
    } else if (tmethod == "dbscan_outliers_only") {
      rc = 0
    }
    rc_indices = which(clusters %in% rc)
    all_ol = union(cluster_outliers, rc_indices)
    # it is possible that all samples get removed, in which case program will crash.
    # so do not remove them
    if (length(all_ol) == nrow(ped_data)) {
      new_xsamples = xsamples
    } else {
      new_xsamples = union(xsamples, pca_data$ids[all_ol])
    }
    it_data$xsamples = new_xsamples
    # prepare plot
    it_data$plots[[plot_number]] = setup_cluster_plot(pca_data, clusters, rc=rc)
    plot_number = plot_number + 1
  }
  it_data$outliers = setdiff(new_xsamples, xsamples)
  it_data$num_plots = plot_number - 1
  return(it_data)
}

# basically an inner join on a list of ids, and a table of ethnicity data
# if eth_data == null, then the second column is filled with NAs
add_ethnicity_data = function(ids, eth_data) {
  n = length(ids)
  if (!is.null(eth_data)) {
    output = matrix(nrow=n, ncol=ncol(eth_data))
    colnames(output) = colnames(eth_data)
    if (n > 0) {
      for (i in 1:n) {
        this_id = ids[i]
        if (this_id %in% rownames(eth_data)) {
          this_row = unlist(lapply(eth_data[this_id, ], as.character), use.names=FALSE)
        } else {
          this_row = c(this_id, rep(NA, ncol(output)-1))
        }
        output[i, ] = this_row
      }
    }
  } else {
    output = cbind(ids, rep(NA, n))
    colnames(output) = c("IID", "Population")
  }
  return(output)
}

generate_directory_name = function(outdir, basename, iteration) {
  newdir = paste0("output_", basename, "_iteration_", iteration - 1)
  full_path = paste0(outdir, "/", newdir)
  return(full_path)
}