annotate R_functions/pca_helpers.R @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
64e75e21466e Uploaded
pmac
parents:
diff changeset
1 library(flashpcaR)
64e75e21466e Uploaded
pmac
parents:
diff changeset
2
64e75e21466e Uploaded
pmac
parents:
diff changeset
3 # read in data from either a numeric ped file or an rds object
64e75e21466e Uploaded
pmac
parents:
diff changeset
4 # output a numeric ped file, with the rownames set to the ids of the samples
64e75e21466e Uploaded
pmac
parents:
diff changeset
5 get_source_data = function(data_source, data_type) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
6 data_type = tolower(data_type)
64e75e21466e Uploaded
pmac
parents:
diff changeset
7 if (data_type == "numeric_ped") {
64e75e21466e Uploaded
pmac
parents:
diff changeset
8 # read in ped file
64e75e21466e Uploaded
pmac
parents:
diff changeset
9 ped_data = read.table(data_source, sep="\t", row.names=1)
64e75e21466e Uploaded
pmac
parents:
diff changeset
10 } else if (data_type == "rds") {
64e75e21466e Uploaded
pmac
parents:
diff changeset
11 hapmap3_object = readRDS(data_source)
64e75e21466e Uploaded
pmac
parents:
diff changeset
12 ped_data = hapmap3_object$bed
64e75e21466e Uploaded
pmac
parents:
diff changeset
13 } else if (data_type == "rdata") {
64e75e21466e Uploaded
pmac
parents:
diff changeset
14 hapmap3_object = load_obj(data_source)
64e75e21466e Uploaded
pmac
parents:
diff changeset
15 ped_data = hapmap3_object$bed
64e75e21466e Uploaded
pmac
parents:
diff changeset
16 } else {
64e75e21466e Uploaded
pmac
parents:
diff changeset
17 print("Unrecognised data type, returning NULL")
64e75e21466e Uploaded
pmac
parents:
diff changeset
18 ped_data = NULL
64e75e21466e Uploaded
pmac
parents:
diff changeset
19 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
20 return(ped_data)
64e75e21466e Uploaded
pmac
parents:
diff changeset
21 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
22
64e75e21466e Uploaded
pmac
parents:
diff changeset
23 # A function that will read in and return a single object from an RData file
64e75e21466e Uploaded
pmac
parents:
diff changeset
24 # This is a workaround so the program can run without needing to know name of the object;
64e75e21466e Uploaded
pmac
parents:
diff changeset
25 # however the assumption is that the RData file contains only ONE object (the one we want)
64e75e21466e Uploaded
pmac
parents:
diff changeset
26 load_obj = function(filename) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
27 # create new environment
64e75e21466e Uploaded
pmac
parents:
diff changeset
28 env = new.env()
64e75e21466e Uploaded
pmac
parents:
diff changeset
29 # load the rdata file into the new environment, and get the NAME
64e75e21466e Uploaded
pmac
parents:
diff changeset
30 # of the first object
64e75e21466e Uploaded
pmac
parents:
diff changeset
31 object_name = load(filename, env)[1]
64e75e21466e Uploaded
pmac
parents:
diff changeset
32 # return the object
64e75e21466e Uploaded
pmac
parents:
diff changeset
33 return(env[[object_name]])
64e75e21466e Uploaded
pmac
parents:
diff changeset
34 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
35
64e75e21466e Uploaded
pmac
parents:
diff changeset
36 # remove unwanted rows or columns (samples and snps, respectively) from
64e75e21466e Uploaded
pmac
parents:
diff changeset
37 # the ped data
64e75e21466e Uploaded
pmac
parents:
diff changeset
38 filter_ped_data = function(ped_data, xsamples, xsnps) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
39 # rows to remove
64e75e21466e Uploaded
pmac
parents:
diff changeset
40 rr = which(rownames(ped_data) %in% xsamples)
64e75e21466e Uploaded
pmac
parents:
diff changeset
41 # remove rejected samples
64e75e21466e Uploaded
pmac
parents:
diff changeset
42 if (length(rr) != 0) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
43 fpd1 = ped_data[-rr, , drop=FALSE]
64e75e21466e Uploaded
pmac
parents:
diff changeset
44 } else {
64e75e21466e Uploaded
pmac
parents:
diff changeset
45 fpd1 = ped_data
64e75e21466e Uploaded
pmac
parents:
diff changeset
46 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
47 # remove all zero and rejected snp columns
64e75e21466e Uploaded
pmac
parents:
diff changeset
48 snps = which(colnames(ped_data) %in% xsnps)
64e75e21466e Uploaded
pmac
parents:
diff changeset
49 zeros = which(colSums(abs(fpd1)) == 0)
64e75e21466e Uploaded
pmac
parents:
diff changeset
50 cr = union(snps, zeros)
64e75e21466e Uploaded
pmac
parents:
diff changeset
51 if (length(cr) != 0) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
52 fpd2 = fpd1[, -cr, drop=FALSE]
64e75e21466e Uploaded
pmac
parents:
diff changeset
53 } else {
64e75e21466e Uploaded
pmac
parents:
diff changeset
54 fpd2 = fpd1
64e75e21466e Uploaded
pmac
parents:
diff changeset
55 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
56 # remove monomorphic snps
64e75e21466e Uploaded
pmac
parents:
diff changeset
57 snp_sds = apply(fpd2, 2, sd)
64e75e21466e Uploaded
pmac
parents:
diff changeset
58 clean_ped = fpd2[, snp_sds >= 0.01, drop=FALSE]
64e75e21466e Uploaded
pmac
parents:
diff changeset
59 return(clean_ped)
64e75e21466e Uploaded
pmac
parents:
diff changeset
60 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
61
64e75e21466e Uploaded
pmac
parents:
diff changeset
62 # Ethnicity file requirements:
64e75e21466e Uploaded
pmac
parents:
diff changeset
63 # - tab delimited
64e75e21466e Uploaded
pmac
parents:
diff changeset
64 # - Must have at least two columns
64e75e21466e Uploaded
pmac
parents:
diff changeset
65 # - First column has sample ID's
64e75e21466e Uploaded
pmac
parents:
diff changeset
66 # - Second column has ethnicities
64e75e21466e Uploaded
pmac
parents:
diff changeset
67 # - First row must be a header
64e75e21466e Uploaded
pmac
parents:
diff changeset
68 parse_ethnicity_file = function(eth_filename) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
69 if (file.exists(eth_filename) == FALSE) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
70 print(paste0("Warning: Ethnicity file: ", eth_filename, " not found"))
64e75e21466e Uploaded
pmac
parents:
diff changeset
71 return(NULL)
64e75e21466e Uploaded
pmac
parents:
diff changeset
72 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
73 if (file.info(eth_filename)$size == 0) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
74 print(paste0("Warning: Ethnicity file: '", eth_filename, "' is empty"))
64e75e21466e Uploaded
pmac
parents:
diff changeset
75 return(NULL)
64e75e21466e Uploaded
pmac
parents:
diff changeset
76 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
77 eth_data = read.table(eth_filename, header=TRUE, sep="\t")
64e75e21466e Uploaded
pmac
parents:
diff changeset
78 rownames(eth_data) = eth_data[, 1]
64e75e21466e Uploaded
pmac
parents:
diff changeset
79 colnames(eth_data)[1] = "IID"
64e75e21466e Uploaded
pmac
parents:
diff changeset
80 colnames(eth_data)[2] = "population"
64e75e21466e Uploaded
pmac
parents:
diff changeset
81 return(eth_data)
64e75e21466e Uploaded
pmac
parents:
diff changeset
82 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
83
64e75e21466e Uploaded
pmac
parents:
diff changeset
84 # Read in a file and return the first column as a
64e75e21466e Uploaded
pmac
parents:
diff changeset
85 # character vector
64e75e21466e Uploaded
pmac
parents:
diff changeset
86 get_first_column = function(fname) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
87 rv = c()
64e75e21466e Uploaded
pmac
parents:
diff changeset
88 if (file.exists(fname) == FALSE) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
89 print(paste0("Warning: File: '", fname, "' not found"))
64e75e21466e Uploaded
pmac
parents:
diff changeset
90 return(rv)
64e75e21466e Uploaded
pmac
parents:
diff changeset
91 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
92 if (file.info(fname)$size == 0) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
93 print(paste0("Warning: File: '", fname, "' is empty"))
64e75e21466e Uploaded
pmac
parents:
diff changeset
94 return(rv)
64e75e21466e Uploaded
pmac
parents:
diff changeset
95 } else {
64e75e21466e Uploaded
pmac
parents:
diff changeset
96 rv = as.character(read.table(fname)[, 1])
64e75e21466e Uploaded
pmac
parents:
diff changeset
97 return(rv)
64e75e21466e Uploaded
pmac
parents:
diff changeset
98 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
99 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
100
64e75e21466e Uploaded
pmac
parents:
diff changeset
101 # Do pca using flashpcar. Returns a 2 element list
64e75e21466e Uploaded
pmac
parents:
diff changeset
102 # values - contains the loadings of the pcs
64e75e21466e Uploaded
pmac
parents:
diff changeset
103 # Will be an n x m matrix, where
64e75e21466e Uploaded
pmac
parents:
diff changeset
104 # - n = Number of samples
64e75e21466e Uploaded
pmac
parents:
diff changeset
105 # - m = number of pcs
64e75e21466e Uploaded
pmac
parents:
diff changeset
106 # ids - Character array of ids, same length as number of rows in values
64e75e21466e Uploaded
pmac
parents:
diff changeset
107 do_pca = function(ped_data) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
108 pca_data = list()
64e75e21466e Uploaded
pmac
parents:
diff changeset
109 pm = data.matrix(ped_data)
64e75e21466e Uploaded
pmac
parents:
diff changeset
110 values = flashpca(pm, ndim=6)$vectors
64e75e21466e Uploaded
pmac
parents:
diff changeset
111 pca_data$values = values
64e75e21466e Uploaded
pmac
parents:
diff changeset
112 pca_data$ids = as.character(rownames(ped_data))
64e75e21466e Uploaded
pmac
parents:
diff changeset
113 return(pca_data)
64e75e21466e Uploaded
pmac
parents:
diff changeset
114 }