diff R_functions/pca_helpers.R @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/R_functions/pca_helpers.R	Wed Jun 01 03:38:39 2016 -0400
@@ -0,0 +1,114 @@
+library(flashpcaR)
+
+# read in data from either a numeric ped file or an rds object
+# output a numeric ped file, with the rownames set to the ids of the samples
+get_source_data = function(data_source, data_type) {
+  data_type = tolower(data_type)
+  if (data_type == "numeric_ped") {
+    # read in ped file
+    ped_data = read.table(data_source, sep="\t", row.names=1)
+  } else if (data_type == "rds") {
+    hapmap3_object = readRDS(data_source)
+    ped_data = hapmap3_object$bed
+  } else if (data_type == "rdata") {
+    hapmap3_object = load_obj(data_source)
+    ped_data = hapmap3_object$bed
+  } else {
+    print("Unrecognised data type, returning NULL")
+    ped_data = NULL
+  }
+  return(ped_data)
+}
+
+# A function that will read in and return a single object from an RData file
+# This is a workaround so the program can run without needing to know name of the object;
+# however the assumption is that the RData file contains only ONE object (the one we want)
+load_obj = function(filename) {
+  # create new environment
+  env = new.env()
+  # load the rdata file into the new environment, and get the NAME
+  # of the first object
+  object_name = load(filename, env)[1]
+  # return the object
+  return(env[[object_name]])
+}
+
+# remove unwanted rows or columns (samples and snps, respectively) from
+# the ped data
+filter_ped_data = function(ped_data, xsamples, xsnps) {
+  # rows to remove
+  rr = which(rownames(ped_data) %in% xsamples)
+  # remove rejected samples
+  if (length(rr) != 0) {
+    fpd1 = ped_data[-rr, , drop=FALSE]
+  } else {
+    fpd1 = ped_data
+  }
+  # remove all zero and rejected snp columns
+  snps = which(colnames(ped_data) %in% xsnps)
+  zeros = which(colSums(abs(fpd1)) == 0)
+  cr = union(snps, zeros)
+  if (length(cr) != 0) {
+    fpd2 = fpd1[, -cr, drop=FALSE]
+  } else {
+    fpd2 = fpd1
+  }
+  # remove monomorphic snps
+  snp_sds = apply(fpd2, 2, sd)
+  clean_ped = fpd2[, snp_sds >= 0.01, drop=FALSE]
+  return(clean_ped)
+}
+
+# Ethnicity file requirements:
+# - tab delimited
+# - Must have at least two columns
+# - First column has sample ID's
+# - Second column has ethnicities
+# - First row must be a header
+parse_ethnicity_file = function(eth_filename) {
+  if (file.exists(eth_filename) == FALSE) {
+    print(paste0("Warning: Ethnicity file: ", eth_filename, " not found"))
+    return(NULL)
+  } 
+  if (file.info(eth_filename)$size == 0) {
+    print(paste0("Warning: Ethnicity file: '", eth_filename, "' is empty"))
+    return(NULL)
+  }
+  eth_data = read.table(eth_filename, header=TRUE, sep="\t")
+  rownames(eth_data) = eth_data[, 1]
+  colnames(eth_data)[1] = "IID"
+  colnames(eth_data)[2] = "population"
+  return(eth_data)
+}
+
+# Read in a file and return the first column as a
+# character vector
+get_first_column = function(fname) {
+  rv = c()
+  if (file.exists(fname) == FALSE) {
+    print(paste0("Warning: File: '", fname, "' not found"))
+    return(rv)
+  } 
+  if (file.info(fname)$size == 0) {
+    print(paste0("Warning: File: '", fname, "' is empty"))
+    return(rv)
+  } else {
+    rv = as.character(read.table(fname)[, 1])
+    return(rv)
+  }
+}
+
+# Do pca using flashpcar. Returns a 2 element list
+# values - contains the loadings of the pcs
+#         Will be an n x m matrix, where 
+#         - n = Number of samples
+#         - m = number of pcs
+# ids - Character array of ids, same length as number of rows in values
+do_pca = function(ped_data) {
+  pca_data = list()
+  pm = data.matrix(ped_data)
+  values = flashpca(pm, ndim=6)$vectors
+  pca_data$values = values
+  pca_data$ids = as.character(rownames(ped_data))
+  return(pca_data)
+}
\ No newline at end of file