view R_functions/pca_helpers.R @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
line wrap: on
line source

library(flashpcaR)

# read in data from either a numeric ped file or an rds object
# output a numeric ped file, with the rownames set to the ids of the samples
get_source_data = function(data_source, data_type) {
  data_type = tolower(data_type)
  if (data_type == "numeric_ped") {
    # read in ped file
    ped_data = read.table(data_source, sep="\t", row.names=1)
  } else if (data_type == "rds") {
    hapmap3_object = readRDS(data_source)
    ped_data = hapmap3_object$bed
  } else if (data_type == "rdata") {
    hapmap3_object = load_obj(data_source)
    ped_data = hapmap3_object$bed
  } else {
    print("Unrecognised data type, returning NULL")
    ped_data = NULL
  }
  return(ped_data)
}

# A function that will read in and return a single object from an RData file
# This is a workaround so the program can run without needing to know name of the object;
# however the assumption is that the RData file contains only ONE object (the one we want)
load_obj = function(filename) {
  # create new environment
  env = new.env()
  # load the rdata file into the new environment, and get the NAME
  # of the first object
  object_name = load(filename, env)[1]
  # return the object
  return(env[[object_name]])
}

# remove unwanted rows or columns (samples and snps, respectively) from
# the ped data
filter_ped_data = function(ped_data, xsamples, xsnps) {
  # rows to remove
  rr = which(rownames(ped_data) %in% xsamples)
  # remove rejected samples
  if (length(rr) != 0) {
    fpd1 = ped_data[-rr, , drop=FALSE]
  } else {
    fpd1 = ped_data
  }
  # remove all zero and rejected snp columns
  snps = which(colnames(ped_data) %in% xsnps)
  zeros = which(colSums(abs(fpd1)) == 0)
  cr = union(snps, zeros)
  if (length(cr) != 0) {
    fpd2 = fpd1[, -cr, drop=FALSE]
  } else {
    fpd2 = fpd1
  }
  # remove monomorphic snps
  snp_sds = apply(fpd2, 2, sd)
  clean_ped = fpd2[, snp_sds >= 0.01, drop=FALSE]
  return(clean_ped)
}

# Ethnicity file requirements:
# - tab delimited
# - Must have at least two columns
# - First column has sample ID's
# - Second column has ethnicities
# - First row must be a header
parse_ethnicity_file = function(eth_filename) {
  if (file.exists(eth_filename) == FALSE) {
    print(paste0("Warning: Ethnicity file: ", eth_filename, " not found"))
    return(NULL)
  } 
  if (file.info(eth_filename)$size == 0) {
    print(paste0("Warning: Ethnicity file: '", eth_filename, "' is empty"))
    return(NULL)
  }
  eth_data = read.table(eth_filename, header=TRUE, sep="\t")
  rownames(eth_data) = eth_data[, 1]
  colnames(eth_data)[1] = "IID"
  colnames(eth_data)[2] = "population"
  return(eth_data)
}

# Read in a file and return the first column as a
# character vector
get_first_column = function(fname) {
  rv = c()
  if (file.exists(fname) == FALSE) {
    print(paste0("Warning: File: '", fname, "' not found"))
    return(rv)
  } 
  if (file.info(fname)$size == 0) {
    print(paste0("Warning: File: '", fname, "' is empty"))
    return(rv)
  } else {
    rv = as.character(read.table(fname)[, 1])
    return(rv)
  }
}

# Do pca using flashpcar. Returns a 2 element list
# values - contains the loadings of the pcs
#         Will be an n x m matrix, where 
#         - n = Number of samples
#         - m = number of pcs
# ids - Character array of ids, same length as number of rows in values
do_pca = function(ped_data) {
  pca_data = list()
  pm = data.matrix(ped_data)
  values = flashpca(pm, ndim=6)$vectors
  pca_data$values = values
  pca_data$ids = as.character(rownames(ped_data))
  return(pca_data)
}