2
|
1 ########################################################
|
|
2 #
|
|
3 # creation date : 04/01/16
|
|
4 # last modification : 17/09/16
|
|
5 # author : Dr Nicolas Beaume
|
|
6 # owner : IRRI
|
|
7 #
|
|
8 ########################################################
|
|
9
|
|
10 ############################ helper functions #######################
|
|
11
|
|
12 # encode one position in one individual
|
|
13 encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){
|
|
14 res <- x
|
|
15 if(!is.na(x)) {
|
|
16 if(isHeterozygous(x, sep = sep)) {
|
|
17 # heterozygous
|
|
18 res <- code[2]
|
|
19 } else {
|
|
20 # determine whether it is the minor or major allele
|
|
21 x <- unlist(strsplit(x, sep))
|
|
22 # need to check only one element as we already know it is a homozygous
|
|
23 if(length(x) > 1) {
|
|
24 x <- x[1]
|
|
25 }
|
|
26 if(x==major) {
|
|
27 res <- code[3]
|
|
28 } else {
|
|
29 res <- code[1]
|
|
30 }
|
|
31 }
|
|
32 } else {
|
29
|
33 # keep NA as NA
|
2
|
34 res <- NA
|
|
35 }
|
|
36 return(res)
|
|
37 }
|
|
38
|
29
|
39 # rewrite a marker to make an exact count of the allele for the current marker
|
2
|
40 encodeGenotype.rewrite <- function(x, sep=""){
|
|
41 res <- x
|
|
42 if(!is.na(x)) {
|
|
43 if(length(unlist(strsplit(x,sep)))==1) {
|
29
|
44 # in case of homozygous, must be counted 2 times so the caracter is written 2 times
|
2
|
45 res <- c(x,x)
|
|
46 } else {
|
29
|
47 # heterozygous
|
2
|
48 res <- unlist(strsplit(x, split=sep))
|
|
49 }
|
|
50 } else {
|
|
51 res <- NA
|
|
52 }
|
|
53 return(res)
|
|
54 }
|
|
55
|
|
56 # encode one individual
|
|
57 encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){
|
29
|
58 # rewrite genotype to make sure everything is written as a double caracter value
|
2
|
59 newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep))
|
29
|
60 # compute the occurcence of each genotype to determine major an minor allele
|
2
|
61 stat <- table(as.character(newIndiv))
|
|
62 major <- names(stat)[which.max(stat)]
|
29
|
63 # Encode using the appropriate code
|
2
|
64 indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep))
|
|
65 return(indiv)
|
|
66 }
|
|
67
|
29
|
68 # determine if the genotype is an homozygous or heterozygous one
|
|
69 # genotype must be written with two characters, even homozygous
|
|
70 # (see encodeGenotype.rewrite() function )
|
2
|
71 isHeterozygous <- function(genotype, sep=""){
|
|
72 bool <- F
|
29
|
73 # case of NA, can't be determined
|
2
|
74 if(is.na(genotype)){
|
|
75 bool <- NA
|
|
76 } else {
|
29
|
77 # check whether both element of the genotype are the same or not
|
2
|
78 x <- unlist(strsplit(genotype, sep))
|
|
79 if(length(x) > 1 & !(x[1] %in% x[2])) {
|
|
80 bool <- T
|
|
81 }
|
|
82
|
|
83 }
|
|
84 return(bool)
|
|
85 }
|
|
86
|
29
|
87 # check if encoding has been made properly. return a boolean vector
|
|
88 # which has the same length that the number of columns of the input matrix
|
|
89 # at marker i, check[i] is true if code[3] is larger than code[1], false otherwise
|
|
90 # please note that this function is not used in the current tool and let
|
|
91 # by convinience for being used outside of galaxy
|
2
|
92 checkEncoding <- function(encoded, code=c(0,1,2)) {
|
|
93 check <- NULL
|
|
94 for(i in 1:ncol(encoded)) {
|
29
|
95 # find major an minor allele
|
2
|
96 major <- length(which(encoded[,i]==code[3]))
|
|
97 minor <- length(which(encoded[,i]==code[1]))
|
29
|
98 # comaprison
|
2
|
99 if(major >= minor) {
|
|
100 check <- c(check, T)
|
|
101 } else {
|
|
102 check <- c(check, F)
|
|
103 }
|
|
104 }
|
|
105 return(check)
|
|
106 }
|
|
107
|
|
108 ################################## main function ###########################
|
|
109 # encode all individuals
|
|
110 encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){
|
29
|
111 # encode genotype
|
2
|
112 encoded <- apply(raw, 2, encodeGenotype.vec, sep, code)
|
29
|
113 # set all NA to -1 (thus encoding schems with -1 are not allowed)
|
2
|
114 encoded[is.na(encoded)] <- -1
|
|
115 write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t")
|
|
116 }
|
|
117
|
|
118 ############################ main #############################
|
29
|
119 # load argument from xml file
|
2
|
120 cmd <- commandArgs(T)
|
|
121 source(cmd[1])
|
29
|
122 # load genotype
|
2
|
123 genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T)
|
29
|
124 code <- c(0,1,2)
|
2
|
125 encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out)
|
|
126 cat(paste(out,".csv", "\n", sep="")) |