iterativepca: R_functions/pca_helpers.R comparison

comparison R_functions/pca_helpers.R @ 0:cb54350e76ae draft default tip

Uploaded

author	jason-ellul
date	Wed, 01 Jun 2016 03:24:56 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:cb54350e76ae
+library(flashpcaR)
+# read in data from either a numeric ped file or an rds object
+# output a numeric ped file, with the rownames set to the ids of the samples
+get_source_data = function(data_source, data_type) {
+data_type = tolower(data_type)
+if (data_type == "numeric_ped") {
+# read in ped file
+ped_data = read.table(data_source, sep="\t", row.names=1)
+} else if (data_type == "rds") {
+hapmap3_object = readRDS(data_source)
+ped_data = hapmap3_object$bed
+} else if (data_type == "rdata") {
+hapmap3_object = load_obj(data_source)
+ped_data = hapmap3_object$bed
+} else {
+print("Unrecognised data type, returning NULL")
+ped_data = NULL
+}
+return(ped_data)
+}
+# A function that will read in and return a single object from an RData file
+# This is a workaround so the program can run without needing to know name of the object;
+# however the assumption is that the RData file contains only ONE object (the one we want)
+load_obj = function(filename) {
+# create new environment
+env = new.env()
+# load the rdata file into the new environment, and get the NAME
+# of the first object
+object_name = load(filename, env)[1]
+# return the object
+return(env[[object_name]])
+}
+# remove unwanted rows or columns (samples and snps, respectively) from
+# the ped data
+filter_ped_data = function(ped_data, xsamples, xsnps) {
+# rows to remove
+rr = which(rownames(ped_data) %in% xsamples)
+# remove rejected samples
+if (length(rr) != 0) {
+fpd1 = ped_data[-rr, , drop=FALSE]
+} else {
+fpd1 = ped_data
+}
+# remove all zero and rejected snp columns
+snps = which(colnames(ped_data) %in% xsnps)
+zeros = which(colSums(abs(fpd1)) == 0)
+cr = union(snps, zeros)
+if (length(cr) != 0) {
+fpd2 = fpd1[, -cr, drop=FALSE]
+} else {
+fpd2 = fpd1
+}
+# remove monomorphic snps
+snp_sds = apply(fpd2, 2, sd)
+clean_ped = fpd2[, snp_sds >= 0.01, drop=FALSE]
+return(clean_ped)
+}
+# Ethnicity file requirements:
+# - tab delimited
+# - Must have at least two columns
+# - First column has sample ID's
+# - Second column has ethnicities
+# - First row must be a header
+parse_ethnicity_file = function(eth_filename) {
+if (file.exists(eth_filename) == FALSE) {
+print(paste0("Warning: Ethnicity file: ", eth_filename, " not found"))
+return(NULL)
+}
+if (file.info(eth_filename)$size == 0) {
+print(paste0("Warning: Ethnicity file: '", eth_filename, "' is empty"))
+return(NULL)
+}
+eth_data = read.table(eth_filename, header=TRUE, sep="\t")
+rownames(eth_data) = eth_data[, 1]
+colnames(eth_data)[1] = "IID"
+colnames(eth_data)[2] = "population"
+return(eth_data)
+}
+# Read in a file and return the first column as a
+# character vector
+get_first_column = function(fname) {
+rv = c()
+if (file.exists(fname) == FALSE) {
+print(paste0("Warning: File: '", fname, "' not found"))
+return(rv)
+}
+if (file.info(fname)$size == 0) {
+print(paste0("Warning: File: '", fname, "' is empty"))
+return(rv)
+} else {
+rv = as.character(read.table(fname)[, 1])
+return(rv)
+}
+}
+# Do pca using flashpcar. Returns a 2 element list
+# values - contains the loadings of the pcs
+#         Will be an n x m matrix, where
+#         - n = Number of samples
+#         - m = number of pcs
+# ids - Character array of ids, same length as number of rows in values
+do_pca = function(ped_data) {
+pca_data = list()
+pm = data.matrix(ped_data)
+values = flashpca(pm, ndim=6)$vectors
+pca_data$values = values
+pca_data$ids = as.character(rownames(ped_data))
+return(pca_data)
+}

Mercurial > repos > jason-ellul > iterativepca

comparison R_functions/pca_helpers.R @ 0:cb54350e76ae draft default tip