Mercurial > repos > pmac > iterativepca
view R_functions/pca_helpers.R @ 0:64e75e21466e draft default tip
Uploaded
author | pmac |
---|---|
date | Wed, 01 Jun 2016 03:38:39 -0400 |
parents | |
children |
line wrap: on
line source
library(flashpcaR) # read in data from either a numeric ped file or an rds object # output a numeric ped file, with the rownames set to the ids of the samples get_source_data = function(data_source, data_type) { data_type = tolower(data_type) if (data_type == "numeric_ped") { # read in ped file ped_data = read.table(data_source, sep="\t", row.names=1) } else if (data_type == "rds") { hapmap3_object = readRDS(data_source) ped_data = hapmap3_object$bed } else if (data_type == "rdata") { hapmap3_object = load_obj(data_source) ped_data = hapmap3_object$bed } else { print("Unrecognised data type, returning NULL") ped_data = NULL } return(ped_data) } # A function that will read in and return a single object from an RData file # This is a workaround so the program can run without needing to know name of the object; # however the assumption is that the RData file contains only ONE object (the one we want) load_obj = function(filename) { # create new environment env = new.env() # load the rdata file into the new environment, and get the NAME # of the first object object_name = load(filename, env)[1] # return the object return(env[[object_name]]) } # remove unwanted rows or columns (samples and snps, respectively) from # the ped data filter_ped_data = function(ped_data, xsamples, xsnps) { # rows to remove rr = which(rownames(ped_data) %in% xsamples) # remove rejected samples if (length(rr) != 0) { fpd1 = ped_data[-rr, , drop=FALSE] } else { fpd1 = ped_data } # remove all zero and rejected snp columns snps = which(colnames(ped_data) %in% xsnps) zeros = which(colSums(abs(fpd1)) == 0) cr = union(snps, zeros) if (length(cr) != 0) { fpd2 = fpd1[, -cr, drop=FALSE] } else { fpd2 = fpd1 } # remove monomorphic snps snp_sds = apply(fpd2, 2, sd) clean_ped = fpd2[, snp_sds >= 0.01, drop=FALSE] return(clean_ped) } # Ethnicity file requirements: # - tab delimited # - Must have at least two columns # - First column has sample ID's # - Second column has ethnicities # - First row must be a header parse_ethnicity_file = function(eth_filename) { if (file.exists(eth_filename) == FALSE) { print(paste0("Warning: Ethnicity file: ", eth_filename, " not found")) return(NULL) } if (file.info(eth_filename)$size == 0) { print(paste0("Warning: Ethnicity file: '", eth_filename, "' is empty")) return(NULL) } eth_data = read.table(eth_filename, header=TRUE, sep="\t") rownames(eth_data) = eth_data[, 1] colnames(eth_data)[1] = "IID" colnames(eth_data)[2] = "population" return(eth_data) } # Read in a file and return the first column as a # character vector get_first_column = function(fname) { rv = c() if (file.exists(fname) == FALSE) { print(paste0("Warning: File: '", fname, "' not found")) return(rv) } if (file.info(fname)$size == 0) { print(paste0("Warning: File: '", fname, "' is empty")) return(rv) } else { rv = as.character(read.table(fname)[, 1]) return(rv) } } # Do pca using flashpcar. Returns a 2 element list # values - contains the loadings of the pcs # Will be an n x m matrix, where # - n = Number of samples # - m = number of pcs # ids - Character array of ids, same length as number of rows in values do_pca = function(ped_data) { pca_data = list() pm = data.matrix(ped_data) values = flashpca(pm, ndim=6)$vectors pca_data$values = values pca_data$ids = as.character(rownames(ped_data)) return(pca_data) }