annotate encode.R @ 103:e7115e44d8d8 draft default tip

Uploaded
author nicolas
date Mon, 31 Oct 2016 07:20:49 -0400
parents ca6af7b12073
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
73
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
1 ########################################################
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
2 #
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
3 # creation date : 04/01/16
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
4 # last modification : 17/09/16
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
5 # author : Dr Nicolas Beaume
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
6 # owner : IRRI
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
7 #
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
8 ########################################################
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
9
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
10 ############################ helper functions #######################
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
11
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
12 # encode one position in one individual
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
13 encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
14 res <- x
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
15 if(!is.na(x)) {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
16 if(isHeterozygous(x, sep = sep)) {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
17 # heterozygous
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
18 res <- code[2]
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
19 } else {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
20 # determine whether it is the minor or major allele
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
21 x <- unlist(strsplit(x, sep))
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
22 # need to check only one element as we already know it is a homozygous
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
23 if(length(x) > 1) {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
24 x <- x[1]
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
25 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
26 if(x==major) {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
27 res <- code[3]
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
28 } else {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
29 res <- code[1]
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
30 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
31 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
32 } else {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
33 # keep NA as NA
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
34 res <- NA
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
35 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
36 return(res)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
37 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
38
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
39 # rewrite a marker to make an exact count of the allele for the current marker
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
40 encodeGenotype.rewrite <- function(x, sep=""){
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
41 res <- x
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
42 if(!is.na(x)) {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
43 if(length(unlist(strsplit(x,sep)))==1) {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
44 # in case of homozygous, must be counted 2 times so the caracter is written 2 times
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
45 res <- c(x,x)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
46 } else {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
47 # heterozygous
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
48 res <- unlist(strsplit(x, split=sep))
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
49 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
50 } else {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
51 res <- NA
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
52 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
53 return(res)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
54 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
55
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
56 # encode one individual
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
57 encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
58 # rewrite genotype to make sure everything is written as a double caracter value
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
59 newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep))
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
60 # compute the occurcence of each genotype to determine major an minor allele
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
61 stat <- table(as.character(newIndiv))
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
62 major <- names(stat)[which.max(stat)]
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
63 # Encode using the appropriate code
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
64 indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep))
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
65 return(indiv)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
66 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
67
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
68 # determine if the genotype is an homozygous or heterozygous one
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
69 # genotype must be written with two characters, even homozygous
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
70 # (see encodeGenotype.rewrite() function )
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
71 isHeterozygous <- function(genotype, sep=""){
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
72 bool <- F
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
73 # case of NA, can't be determined
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
74 if(is.na(genotype)){
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
75 bool <- NA
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
76 } else {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
77 # check whether both element of the genotype are the same or not
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
78 x <- unlist(strsplit(genotype, sep))
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
79 if(length(x) > 1 & !(x[1] %in% x[2])) {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
80 bool <- T
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
81 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
82
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
83 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
84 return(bool)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
85 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
86
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
87 # check if encoding has been made properly. return a boolean vector
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
88 # which has the same length that the number of columns of the input matrix
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
89 # at marker i, check[i] is true if code[3] is larger than code[1], false otherwise
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
90 # please note that this function is not used in the current tool and let
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
91 # by convinience for being used outside of galaxy
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
92 checkEncoding <- function(encoded, code=c(0,1,2)) {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
93 check <- NULL
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
94 for(i in 1:ncol(encoded)) {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
95 # find major an minor allele
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
96 major <- length(which(encoded[,i]==code[3]))
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
97 minor <- length(which(encoded[,i]==code[1]))
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
98 # comaprison
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
99 if(major >= minor) {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
100 check <- c(check, T)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
101 } else {
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
102 check <- c(check, F)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
103 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
104 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
105 return(check)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
106 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
107
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
108 ################################## main function ###########################
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
109 # encode all individuals
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
110 encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
111 # encode genotype
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
112 encoded <- apply(raw, 2, encodeGenotype.vec, sep, code)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
113 # set all NA to -1 (thus encoding schems with -1 are not allowed)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
114 encoded[is.na(encoded)] <- -1
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
115 write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t")
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
116 }
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
117
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
118 ############################ main #############################
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
119 # load argument from xml file
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
120 cmd <- commandArgs(T)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
121 source(cmd[1])
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
122 # load genotype
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
123 genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
124 code <- c(0,1,2)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
125 encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out)
ca6af7b12073 Uploaded
nicolas
parents:
diff changeset
126 cat(paste(out,".csv", "\n", sep=""))