Mercurial > repos > jason-ellul > iterativepca
comparison R_functions/pca_helpers.R @ 0:cb54350e76ae draft default tip
Uploaded
| author | jason-ellul |
|---|---|
| date | Wed, 01 Jun 2016 03:24:56 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:cb54350e76ae |
|---|---|
| 1 library(flashpcaR) | |
| 2 | |
| 3 # read in data from either a numeric ped file or an rds object | |
| 4 # output a numeric ped file, with the rownames set to the ids of the samples | |
| 5 get_source_data = function(data_source, data_type) { | |
| 6 data_type = tolower(data_type) | |
| 7 if (data_type == "numeric_ped") { | |
| 8 # read in ped file | |
| 9 ped_data = read.table(data_source, sep="\t", row.names=1) | |
| 10 } else if (data_type == "rds") { | |
| 11 hapmap3_object = readRDS(data_source) | |
| 12 ped_data = hapmap3_object$bed | |
| 13 } else if (data_type == "rdata") { | |
| 14 hapmap3_object = load_obj(data_source) | |
| 15 ped_data = hapmap3_object$bed | |
| 16 } else { | |
| 17 print("Unrecognised data type, returning NULL") | |
| 18 ped_data = NULL | |
| 19 } | |
| 20 return(ped_data) | |
| 21 } | |
| 22 | |
| 23 # A function that will read in and return a single object from an RData file | |
| 24 # This is a workaround so the program can run without needing to know name of the object; | |
| 25 # however the assumption is that the RData file contains only ONE object (the one we want) | |
| 26 load_obj = function(filename) { | |
| 27 # create new environment | |
| 28 env = new.env() | |
| 29 # load the rdata file into the new environment, and get the NAME | |
| 30 # of the first object | |
| 31 object_name = load(filename, env)[1] | |
| 32 # return the object | |
| 33 return(env[[object_name]]) | |
| 34 } | |
| 35 | |
| 36 # remove unwanted rows or columns (samples and snps, respectively) from | |
| 37 # the ped data | |
| 38 filter_ped_data = function(ped_data, xsamples, xsnps) { | |
| 39 # rows to remove | |
| 40 rr = which(rownames(ped_data) %in% xsamples) | |
| 41 # remove rejected samples | |
| 42 if (length(rr) != 0) { | |
| 43 fpd1 = ped_data[-rr, , drop=FALSE] | |
| 44 } else { | |
| 45 fpd1 = ped_data | |
| 46 } | |
| 47 # remove all zero and rejected snp columns | |
| 48 snps = which(colnames(ped_data) %in% xsnps) | |
| 49 zeros = which(colSums(abs(fpd1)) == 0) | |
| 50 cr = union(snps, zeros) | |
| 51 if (length(cr) != 0) { | |
| 52 fpd2 = fpd1[, -cr, drop=FALSE] | |
| 53 } else { | |
| 54 fpd2 = fpd1 | |
| 55 } | |
| 56 # remove monomorphic snps | |
| 57 snp_sds = apply(fpd2, 2, sd) | |
| 58 clean_ped = fpd2[, snp_sds >= 0.01, drop=FALSE] | |
| 59 return(clean_ped) | |
| 60 } | |
| 61 | |
| 62 # Ethnicity file requirements: | |
| 63 # - tab delimited | |
| 64 # - Must have at least two columns | |
| 65 # - First column has sample ID's | |
| 66 # - Second column has ethnicities | |
| 67 # - First row must be a header | |
| 68 parse_ethnicity_file = function(eth_filename) { | |
| 69 if (file.exists(eth_filename) == FALSE) { | |
| 70 print(paste0("Warning: Ethnicity file: ", eth_filename, " not found")) | |
| 71 return(NULL) | |
| 72 } | |
| 73 if (file.info(eth_filename)$size == 0) { | |
| 74 print(paste0("Warning: Ethnicity file: '", eth_filename, "' is empty")) | |
| 75 return(NULL) | |
| 76 } | |
| 77 eth_data = read.table(eth_filename, header=TRUE, sep="\t") | |
| 78 rownames(eth_data) = eth_data[, 1] | |
| 79 colnames(eth_data)[1] = "IID" | |
| 80 colnames(eth_data)[2] = "population" | |
| 81 return(eth_data) | |
| 82 } | |
| 83 | |
| 84 # Read in a file and return the first column as a | |
| 85 # character vector | |
| 86 get_first_column = function(fname) { | |
| 87 rv = c() | |
| 88 if (file.exists(fname) == FALSE) { | |
| 89 print(paste0("Warning: File: '", fname, "' not found")) | |
| 90 return(rv) | |
| 91 } | |
| 92 if (file.info(fname)$size == 0) { | |
| 93 print(paste0("Warning: File: '", fname, "' is empty")) | |
| 94 return(rv) | |
| 95 } else { | |
| 96 rv = as.character(read.table(fname)[, 1]) | |
| 97 return(rv) | |
| 98 } | |
| 99 } | |
| 100 | |
| 101 # Do pca using flashpcar. Returns a 2 element list | |
| 102 # values - contains the loadings of the pcs | |
| 103 # Will be an n x m matrix, where | |
| 104 # - n = Number of samples | |
| 105 # - m = number of pcs | |
| 106 # ids - Character array of ids, same length as number of rows in values | |
| 107 do_pca = function(ped_data) { | |
| 108 pca_data = list() | |
| 109 pm = data.matrix(ped_data) | |
| 110 values = flashpca(pm, ndim=6)$vectors | |
| 111 pca_data$values = values | |
| 112 pca_data$ids = as.character(rownames(ped_data)) | |
| 113 return(pca_data) | |
| 114 } |
