comparison encode.R @ 29:89175737f16b draft

Uploaded
author nicolas
date Tue, 25 Oct 2016 14:38:58 -0400
parents f3f230290ffe
children
comparison
equal deleted inserted replaced
28:40664d2d295f 29:89175737f16b
4 # last modification : 17/09/16 4 # last modification : 17/09/16
5 # author : Dr Nicolas Beaume 5 # author : Dr Nicolas Beaume
6 # owner : IRRI 6 # owner : IRRI
7 # 7 #
8 ######################################################## 8 ########################################################
9
10 log <- file(paste(getwd(), "log_encode.txt", sep="/"), open = "wt")
11 sink(file = log, type="message")
12 9
13 ############################ helper functions ####################### 10 ############################ helper functions #######################
14 11
15 # encode one position in one individual 12 # encode one position in one individual
16 encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){ 13 encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){
31 } else { 28 } else {
32 res <- code[1] 29 res <- code[1]
33 } 30 }
34 } 31 }
35 } else { 32 } else {
33 # keep NA as NA
36 res <- NA 34 res <- NA
37 } 35 }
38 return(res) 36 return(res)
39 } 37 }
40 38
41 # rewrite a marker to determine the major allele 39 # rewrite a marker to make an exact count of the allele for the current marker
42 encodeGenotype.rewrite <- function(x, sep=""){ 40 encodeGenotype.rewrite <- function(x, sep=""){
43 res <- x 41 res <- x
44 if(!is.na(x)) { 42 if(!is.na(x)) {
45 if(length(unlist(strsplit(x,sep)))==1) { 43 if(length(unlist(strsplit(x,sep)))==1) {
46 # in case of homozygous, must be counted 2 times 44 # in case of homozygous, must be counted 2 times so the caracter is written 2 times
47 res <- c(x,x) 45 res <- c(x,x)
48 } else { 46 } else {
47 # heterozygous
49 res <- unlist(strsplit(x, split=sep)) 48 res <- unlist(strsplit(x, split=sep))
50 } 49 }
51 } else { 50 } else {
52 res <- NA 51 res <- NA
53 } 52 }
54 return(res) 53 return(res)
55 } 54 }
56 55
57 # encode one individual 56 # encode one individual
58 encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){ 57 encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){
58 # rewrite genotype to make sure everything is written as a double caracter value
59 newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep)) 59 newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep))
60 # compute the occurcence of each genotype to determine major an minor allele
60 stat <- table(as.character(newIndiv)) 61 stat <- table(as.character(newIndiv))
61 major <- names(stat)[which.max(stat)] 62 major <- names(stat)[which.max(stat)]
63 # Encode using the appropriate code
62 indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep)) 64 indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep))
63 return(indiv) 65 return(indiv)
64 } 66 }
65 67
66 68 # determine if the genotype is an homozygous or heterozygous one
69 # genotype must be written with two characters, even homozygous
70 # (see encodeGenotype.rewrite() function )
67 isHeterozygous <- function(genotype, sep=""){ 71 isHeterozygous <- function(genotype, sep=""){
68 bool <- F 72 bool <- F
73 # case of NA, can't be determined
69 if(is.na(genotype)){ 74 if(is.na(genotype)){
70 bool <- NA 75 bool <- NA
71 } else { 76 } else {
77 # check whether both element of the genotype are the same or not
72 x <- unlist(strsplit(genotype, sep)) 78 x <- unlist(strsplit(genotype, sep))
73 if(length(x) > 1 & !(x[1] %in% x[2])) { 79 if(length(x) > 1 & !(x[1] %in% x[2])) {
74 bool <- T 80 bool <- T
75 } 81 }
76 82
77 } 83 }
78 return(bool) 84 return(bool)
79 } 85 }
80 86
87 # check if encoding has been made properly. return a boolean vector
88 # which has the same length that the number of columns of the input matrix
89 # at marker i, check[i] is true if code[3] is larger than code[1], false otherwise
90 # please note that this function is not used in the current tool and let
91 # by convinience for being used outside of galaxy
81 checkEncoding <- function(encoded, code=c(0,1,2)) { 92 checkEncoding <- function(encoded, code=c(0,1,2)) {
82 check <- NULL 93 check <- NULL
83 for(i in 1:ncol(encoded)) { 94 for(i in 1:ncol(encoded)) {
95 # find major an minor allele
84 major <- length(which(encoded[,i]==code[3])) 96 major <- length(which(encoded[,i]==code[3]))
85 minor <- length(which(encoded[,i]==code[1])) 97 minor <- length(which(encoded[,i]==code[1]))
98 # comaprison
86 if(major >= minor) { 99 if(major >= minor) {
87 check <- c(check, T) 100 check <- c(check, T)
88 } else { 101 } else {
89 check <- c(check, F) 102 check <- c(check, F)
90 } 103 }
92 return(check) 105 return(check)
93 } 106 }
94 107
95 ################################## main function ########################### 108 ################################## main function ###########################
96 # encode all individuals 109 # encode all individuals
97 # encode genotype into a {-1,0,1} scheme, where -1 = minor homozygous, 0 = heterozygous, 1 = major homozygous
98 encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){ 110 encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){
111 # encode genotype
99 encoded <- apply(raw, 2, encodeGenotype.vec, sep, code) 112 encoded <- apply(raw, 2, encodeGenotype.vec, sep, code)
113 # set all NA to -1 (thus encoding schems with -1 are not allowed)
100 encoded[is.na(encoded)] <- -1 114 encoded[is.na(encoded)] <- -1
101 write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t") 115 write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t")
102 } 116 }
103 117
104 ############################ main ############################# 118 ############################ main #############################
105 # running from terminal (supposing the OghmaGalaxy/bin directory is in your path) : 119 # load argument from xml file
106 # encode.sh -i path_to_raw_data -s separator -c code -o path_to_result_directory
107 ## -i : path to the file that contains the genotypes to encode, must be a .rda file (as outputed by loadGenotype.R).
108 # please note that the table must be called "genotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used)
109
110 ## -s : in case of heterozygous both allele are encoded in the same "cell" of the table and separated by a character
111 # (most often "" or "/"). This argument specify which character
112
113 ## -c : the encoding of minor allele/heterozygous/major allele. by default {-1,0,1}
114
115 ## -o : path to the file of encoded genotype. the .rda extension is automatically added
116
117 cmd <- commandArgs(T) 120 cmd <- commandArgs(T)
118 source(cmd[1]) 121 source(cmd[1])
122 # load genotype
119 genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T) 123 genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T)
120 # deal with optional argument 124 code <- c(0,1,2)
121 code <- strsplit(code, ",")
122 code <- unlist(lapply(code, as.numeric), use.names = F)
123 encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out) 125 encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out)
124 cat(paste(out,".csv", "\n", sep="")) 126 cat(paste(out,".csv", "\n", sep=""))