Mercurial > repos > nicolas > oghma
comparison encode.R @ 29:89175737f16b draft
Uploaded
| author | nicolas |
|---|---|
| date | Tue, 25 Oct 2016 14:38:58 -0400 |
| parents | f3f230290ffe |
| children |
comparison
equal
deleted
inserted
replaced
| 28:40664d2d295f | 29:89175737f16b |
|---|---|
| 4 # last modification : 17/09/16 | 4 # last modification : 17/09/16 |
| 5 # author : Dr Nicolas Beaume | 5 # author : Dr Nicolas Beaume |
| 6 # owner : IRRI | 6 # owner : IRRI |
| 7 # | 7 # |
| 8 ######################################################## | 8 ######################################################## |
| 9 | |
| 10 log <- file(paste(getwd(), "log_encode.txt", sep="/"), open = "wt") | |
| 11 sink(file = log, type="message") | |
| 12 | 9 |
| 13 ############################ helper functions ####################### | 10 ############################ helper functions ####################### |
| 14 | 11 |
| 15 # encode one position in one individual | 12 # encode one position in one individual |
| 16 encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){ | 13 encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){ |
| 31 } else { | 28 } else { |
| 32 res <- code[1] | 29 res <- code[1] |
| 33 } | 30 } |
| 34 } | 31 } |
| 35 } else { | 32 } else { |
| 33 # keep NA as NA | |
| 36 res <- NA | 34 res <- NA |
| 37 } | 35 } |
| 38 return(res) | 36 return(res) |
| 39 } | 37 } |
| 40 | 38 |
| 41 # rewrite a marker to determine the major allele | 39 # rewrite a marker to make an exact count of the allele for the current marker |
| 42 encodeGenotype.rewrite <- function(x, sep=""){ | 40 encodeGenotype.rewrite <- function(x, sep=""){ |
| 43 res <- x | 41 res <- x |
| 44 if(!is.na(x)) { | 42 if(!is.na(x)) { |
| 45 if(length(unlist(strsplit(x,sep)))==1) { | 43 if(length(unlist(strsplit(x,sep)))==1) { |
| 46 # in case of homozygous, must be counted 2 times | 44 # in case of homozygous, must be counted 2 times so the caracter is written 2 times |
| 47 res <- c(x,x) | 45 res <- c(x,x) |
| 48 } else { | 46 } else { |
| 47 # heterozygous | |
| 49 res <- unlist(strsplit(x, split=sep)) | 48 res <- unlist(strsplit(x, split=sep)) |
| 50 } | 49 } |
| 51 } else { | 50 } else { |
| 52 res <- NA | 51 res <- NA |
| 53 } | 52 } |
| 54 return(res) | 53 return(res) |
| 55 } | 54 } |
| 56 | 55 |
| 57 # encode one individual | 56 # encode one individual |
| 58 encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){ | 57 encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){ |
| 58 # rewrite genotype to make sure everything is written as a double caracter value | |
| 59 newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep)) | 59 newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep)) |
| 60 # compute the occurcence of each genotype to determine major an minor allele | |
| 60 stat <- table(as.character(newIndiv)) | 61 stat <- table(as.character(newIndiv)) |
| 61 major <- names(stat)[which.max(stat)] | 62 major <- names(stat)[which.max(stat)] |
| 63 # Encode using the appropriate code | |
| 62 indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep)) | 64 indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep)) |
| 63 return(indiv) | 65 return(indiv) |
| 64 } | 66 } |
| 65 | 67 |
| 66 | 68 # determine if the genotype is an homozygous or heterozygous one |
| 69 # genotype must be written with two characters, even homozygous | |
| 70 # (see encodeGenotype.rewrite() function ) | |
| 67 isHeterozygous <- function(genotype, sep=""){ | 71 isHeterozygous <- function(genotype, sep=""){ |
| 68 bool <- F | 72 bool <- F |
| 73 # case of NA, can't be determined | |
| 69 if(is.na(genotype)){ | 74 if(is.na(genotype)){ |
| 70 bool <- NA | 75 bool <- NA |
| 71 } else { | 76 } else { |
| 77 # check whether both element of the genotype are the same or not | |
| 72 x <- unlist(strsplit(genotype, sep)) | 78 x <- unlist(strsplit(genotype, sep)) |
| 73 if(length(x) > 1 & !(x[1] %in% x[2])) { | 79 if(length(x) > 1 & !(x[1] %in% x[2])) { |
| 74 bool <- T | 80 bool <- T |
| 75 } | 81 } |
| 76 | 82 |
| 77 } | 83 } |
| 78 return(bool) | 84 return(bool) |
| 79 } | 85 } |
| 80 | 86 |
| 87 # check if encoding has been made properly. return a boolean vector | |
| 88 # which has the same length that the number of columns of the input matrix | |
| 89 # at marker i, check[i] is true if code[3] is larger than code[1], false otherwise | |
| 90 # please note that this function is not used in the current tool and let | |
| 91 # by convinience for being used outside of galaxy | |
| 81 checkEncoding <- function(encoded, code=c(0,1,2)) { | 92 checkEncoding <- function(encoded, code=c(0,1,2)) { |
| 82 check <- NULL | 93 check <- NULL |
| 83 for(i in 1:ncol(encoded)) { | 94 for(i in 1:ncol(encoded)) { |
| 95 # find major an minor allele | |
| 84 major <- length(which(encoded[,i]==code[3])) | 96 major <- length(which(encoded[,i]==code[3])) |
| 85 minor <- length(which(encoded[,i]==code[1])) | 97 minor <- length(which(encoded[,i]==code[1])) |
| 98 # comaprison | |
| 86 if(major >= minor) { | 99 if(major >= minor) { |
| 87 check <- c(check, T) | 100 check <- c(check, T) |
| 88 } else { | 101 } else { |
| 89 check <- c(check, F) | 102 check <- c(check, F) |
| 90 } | 103 } |
| 92 return(check) | 105 return(check) |
| 93 } | 106 } |
| 94 | 107 |
| 95 ################################## main function ########################### | 108 ################################## main function ########################### |
| 96 # encode all individuals | 109 # encode all individuals |
| 97 # encode genotype into a {-1,0,1} scheme, where -1 = minor homozygous, 0 = heterozygous, 1 = major homozygous | |
| 98 encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){ | 110 encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){ |
| 111 # encode genotype | |
| 99 encoded <- apply(raw, 2, encodeGenotype.vec, sep, code) | 112 encoded <- apply(raw, 2, encodeGenotype.vec, sep, code) |
| 113 # set all NA to -1 (thus encoding schems with -1 are not allowed) | |
| 100 encoded[is.na(encoded)] <- -1 | 114 encoded[is.na(encoded)] <- -1 |
| 101 write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t") | 115 write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t") |
| 102 } | 116 } |
| 103 | 117 |
| 104 ############################ main ############################# | 118 ############################ main ############################# |
| 105 # running from terminal (supposing the OghmaGalaxy/bin directory is in your path) : | 119 # load argument from xml file |
| 106 # encode.sh -i path_to_raw_data -s separator -c code -o path_to_result_directory | |
| 107 ## -i : path to the file that contains the genotypes to encode, must be a .rda file (as outputed by loadGenotype.R). | |
| 108 # please note that the table must be called "genotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used) | |
| 109 | |
| 110 ## -s : in case of heterozygous both allele are encoded in the same "cell" of the table and separated by a character | |
| 111 # (most often "" or "/"). This argument specify which character | |
| 112 | |
| 113 ## -c : the encoding of minor allele/heterozygous/major allele. by default {-1,0,1} | |
| 114 | |
| 115 ## -o : path to the file of encoded genotype. the .rda extension is automatically added | |
| 116 | |
| 117 cmd <- commandArgs(T) | 120 cmd <- commandArgs(T) |
| 118 source(cmd[1]) | 121 source(cmd[1]) |
| 122 # load genotype | |
| 119 genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T) | 123 genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T) |
| 120 # deal with optional argument | 124 code <- c(0,1,2) |
| 121 code <- strsplit(code, ",") | |
| 122 code <- unlist(lapply(code, as.numeric), use.names = F) | |
| 123 encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out) | 125 encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out) |
| 124 cat(paste(out,".csv", "\n", sep="")) | 126 cat(paste(out,".csv", "\n", sep="")) |
