Mercurial > repos > jason-ellul > iterativepca
diff R_functions/pca_helpers.R @ 0:cb54350e76ae draft default tip
Uploaded
| author | jason-ellul | 
|---|---|
| date | Wed, 01 Jun 2016 03:24:56 -0400 | 
| parents | |
| children | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/R_functions/pca_helpers.R Wed Jun 01 03:24:56 2016 -0400 @@ -0,0 +1,114 @@ +library(flashpcaR) + +# read in data from either a numeric ped file or an rds object +# output a numeric ped file, with the rownames set to the ids of the samples +get_source_data = function(data_source, data_type) { + data_type = tolower(data_type) + if (data_type == "numeric_ped") { + # read in ped file + ped_data = read.table(data_source, sep="\t", row.names=1) + } else if (data_type == "rds") { + hapmap3_object = readRDS(data_source) + ped_data = hapmap3_object$bed + } else if (data_type == "rdata") { + hapmap3_object = load_obj(data_source) + ped_data = hapmap3_object$bed + } else { + print("Unrecognised data type, returning NULL") + ped_data = NULL + } + return(ped_data) +} + +# A function that will read in and return a single object from an RData file +# This is a workaround so the program can run without needing to know name of the object; +# however the assumption is that the RData file contains only ONE object (the one we want) +load_obj = function(filename) { + # create new environment + env = new.env() + # load the rdata file into the new environment, and get the NAME + # of the first object + object_name = load(filename, env)[1] + # return the object + return(env[[object_name]]) +} + +# remove unwanted rows or columns (samples and snps, respectively) from +# the ped data +filter_ped_data = function(ped_data, xsamples, xsnps) { + # rows to remove + rr = which(rownames(ped_data) %in% xsamples) + # remove rejected samples + if (length(rr) != 0) { + fpd1 = ped_data[-rr, , drop=FALSE] + } else { + fpd1 = ped_data + } + # remove all zero and rejected snp columns + snps = which(colnames(ped_data) %in% xsnps) + zeros = which(colSums(abs(fpd1)) == 0) + cr = union(snps, zeros) + if (length(cr) != 0) { + fpd2 = fpd1[, -cr, drop=FALSE] + } else { + fpd2 = fpd1 + } + # remove monomorphic snps + snp_sds = apply(fpd2, 2, sd) + clean_ped = fpd2[, snp_sds >= 0.01, drop=FALSE] + return(clean_ped) +} + +# Ethnicity file requirements: +# - tab delimited +# - Must have at least two columns +# - First column has sample ID's +# - Second column has ethnicities +# - First row must be a header +parse_ethnicity_file = function(eth_filename) { + if (file.exists(eth_filename) == FALSE) { + print(paste0("Warning: Ethnicity file: ", eth_filename, " not found")) + return(NULL) + } + if (file.info(eth_filename)$size == 0) { + print(paste0("Warning: Ethnicity file: '", eth_filename, "' is empty")) + return(NULL) + } + eth_data = read.table(eth_filename, header=TRUE, sep="\t") + rownames(eth_data) = eth_data[, 1] + colnames(eth_data)[1] = "IID" + colnames(eth_data)[2] = "population" + return(eth_data) +} + +# Read in a file and return the first column as a +# character vector +get_first_column = function(fname) { + rv = c() + if (file.exists(fname) == FALSE) { + print(paste0("Warning: File: '", fname, "' not found")) + return(rv) + } + if (file.info(fname)$size == 0) { + print(paste0("Warning: File: '", fname, "' is empty")) + return(rv) + } else { + rv = as.character(read.table(fname)[, 1]) + return(rv) + } +} + +# Do pca using flashpcar. Returns a 2 element list +# values - contains the loadings of the pcs +# Will be an n x m matrix, where +# - n = Number of samples +# - m = number of pcs +# ids - Character array of ids, same length as number of rows in values +do_pca = function(ped_data) { + pca_data = list() + pm = data.matrix(ped_data) + values = flashpca(pm, ndim=6)$vectors + pca_data$values = values + pca_data$ids = as.character(rownames(ped_data)) + return(pca_data) +} \ No newline at end of file
