| 
0
 | 
     1 library(flashpcaR)
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 # read in data from either a numeric ped file or an rds object
 | 
| 
 | 
     4 # output a numeric ped file, with the rownames set to the ids of the samples
 | 
| 
 | 
     5 get_source_data = function(data_source, data_type) {
 | 
| 
 | 
     6   data_type = tolower(data_type)
 | 
| 
 | 
     7   if (data_type == "numeric_ped") {
 | 
| 
 | 
     8     # read in ped file
 | 
| 
 | 
     9     ped_data = read.table(data_source, sep="\t", row.names=1)
 | 
| 
 | 
    10   } else if (data_type == "rds") {
 | 
| 
 | 
    11     hapmap3_object = readRDS(data_source)
 | 
| 
 | 
    12     ped_data = hapmap3_object$bed
 | 
| 
 | 
    13   } else if (data_type == "rdata") {
 | 
| 
 | 
    14     hapmap3_object = load_obj(data_source)
 | 
| 
 | 
    15     ped_data = hapmap3_object$bed
 | 
| 
 | 
    16   } else {
 | 
| 
 | 
    17     print("Unrecognised data type, returning NULL")
 | 
| 
 | 
    18     ped_data = NULL
 | 
| 
 | 
    19   }
 | 
| 
 | 
    20   return(ped_data)
 | 
| 
 | 
    21 }
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 # A function that will read in and return a single object from an RData file
 | 
| 
 | 
    24 # This is a workaround so the program can run without needing to know name of the object;
 | 
| 
 | 
    25 # however the assumption is that the RData file contains only ONE object (the one we want)
 | 
| 
 | 
    26 load_obj = function(filename) {
 | 
| 
 | 
    27   # create new environment
 | 
| 
 | 
    28   env = new.env()
 | 
| 
 | 
    29   # load the rdata file into the new environment, and get the NAME
 | 
| 
 | 
    30   # of the first object
 | 
| 
 | 
    31   object_name = load(filename, env)[1]
 | 
| 
 | 
    32   # return the object
 | 
| 
 | 
    33   return(env[[object_name]])
 | 
| 
 | 
    34 }
 | 
| 
 | 
    35 
 | 
| 
 | 
    36 # remove unwanted rows or columns (samples and snps, respectively) from
 | 
| 
 | 
    37 # the ped data
 | 
| 
 | 
    38 filter_ped_data = function(ped_data, xsamples, xsnps) {
 | 
| 
 | 
    39   # rows to remove
 | 
| 
 | 
    40   rr = which(rownames(ped_data) %in% xsamples)
 | 
| 
 | 
    41   # remove rejected samples
 | 
| 
 | 
    42   if (length(rr) != 0) {
 | 
| 
 | 
    43     fpd1 = ped_data[-rr, , drop=FALSE]
 | 
| 
 | 
    44   } else {
 | 
| 
 | 
    45     fpd1 = ped_data
 | 
| 
 | 
    46   }
 | 
| 
 | 
    47   # remove all zero and rejected snp columns
 | 
| 
 | 
    48   snps = which(colnames(ped_data) %in% xsnps)
 | 
| 
 | 
    49   zeros = which(colSums(abs(fpd1)) == 0)
 | 
| 
 | 
    50   cr = union(snps, zeros)
 | 
| 
 | 
    51   if (length(cr) != 0) {
 | 
| 
 | 
    52     fpd2 = fpd1[, -cr, drop=FALSE]
 | 
| 
 | 
    53   } else {
 | 
| 
 | 
    54     fpd2 = fpd1
 | 
| 
 | 
    55   }
 | 
| 
 | 
    56   # remove monomorphic snps
 | 
| 
 | 
    57   snp_sds = apply(fpd2, 2, sd)
 | 
| 
 | 
    58   clean_ped = fpd2[, snp_sds >= 0.01, drop=FALSE]
 | 
| 
 | 
    59   return(clean_ped)
 | 
| 
 | 
    60 }
 | 
| 
 | 
    61 
 | 
| 
 | 
    62 # Ethnicity file requirements:
 | 
| 
 | 
    63 # - tab delimited
 | 
| 
 | 
    64 # - Must have at least two columns
 | 
| 
 | 
    65 # - First column has sample ID's
 | 
| 
 | 
    66 # - Second column has ethnicities
 | 
| 
 | 
    67 # - First row must be a header
 | 
| 
 | 
    68 parse_ethnicity_file = function(eth_filename) {
 | 
| 
 | 
    69   if (file.exists(eth_filename) == FALSE) {
 | 
| 
 | 
    70     print(paste0("Warning: Ethnicity file: ", eth_filename, " not found"))
 | 
| 
 | 
    71     return(NULL)
 | 
| 
 | 
    72   } 
 | 
| 
 | 
    73   if (file.info(eth_filename)$size == 0) {
 | 
| 
 | 
    74     print(paste0("Warning: Ethnicity file: '", eth_filename, "' is empty"))
 | 
| 
 | 
    75     return(NULL)
 | 
| 
 | 
    76   }
 | 
| 
 | 
    77   eth_data = read.table(eth_filename, header=TRUE, sep="\t")
 | 
| 
 | 
    78   rownames(eth_data) = eth_data[, 1]
 | 
| 
 | 
    79   colnames(eth_data)[1] = "IID"
 | 
| 
 | 
    80   colnames(eth_data)[2] = "population"
 | 
| 
 | 
    81   return(eth_data)
 | 
| 
 | 
    82 }
 | 
| 
 | 
    83 
 | 
| 
 | 
    84 # Read in a file and return the first column as a
 | 
| 
 | 
    85 # character vector
 | 
| 
 | 
    86 get_first_column = function(fname) {
 | 
| 
 | 
    87   rv = c()
 | 
| 
 | 
    88   if (file.exists(fname) == FALSE) {
 | 
| 
 | 
    89     print(paste0("Warning: File: '", fname, "' not found"))
 | 
| 
 | 
    90     return(rv)
 | 
| 
 | 
    91   } 
 | 
| 
 | 
    92   if (file.info(fname)$size == 0) {
 | 
| 
 | 
    93     print(paste0("Warning: File: '", fname, "' is empty"))
 | 
| 
 | 
    94     return(rv)
 | 
| 
 | 
    95   } else {
 | 
| 
 | 
    96     rv = as.character(read.table(fname)[, 1])
 | 
| 
 | 
    97     return(rv)
 | 
| 
 | 
    98   }
 | 
| 
 | 
    99 }
 | 
| 
 | 
   100 
 | 
| 
 | 
   101 # Do pca using flashpcar. Returns a 2 element list
 | 
| 
 | 
   102 # values - contains the loadings of the pcs
 | 
| 
 | 
   103 #         Will be an n x m matrix, where 
 | 
| 
 | 
   104 #         - n = Number of samples
 | 
| 
 | 
   105 #         - m = number of pcs
 | 
| 
 | 
   106 # ids - Character array of ids, same length as number of rows in values
 | 
| 
 | 
   107 do_pca = function(ped_data) {
 | 
| 
 | 
   108   pca_data = list()
 | 
| 
 | 
   109   pm = data.matrix(ped_data)
 | 
| 
 | 
   110   values = flashpca(pm, ndim=6)$vectors
 | 
| 
 | 
   111   pca_data$values = values
 | 
| 
 | 
   112   pca_data$ids = as.character(rownames(ped_data))
 | 
| 
 | 
   113   return(pca_data)
 | 
| 
 | 
   114 } |