annotate encode.R @ 22:f0d89ff35ad2 draft

Uploaded
author nicolas
date Fri, 21 Oct 2016 10:35:13 -0400
parents f3f230290ffe
children 89175737f16b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
1 ########################################################
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
2 #
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
3 # creation date : 04/01/16
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
4 # last modification : 17/09/16
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
5 # author : Dr Nicolas Beaume
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
6 # owner : IRRI
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
7 #
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
8 ########################################################
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
9
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
10 log <- file(paste(getwd(), "log_encode.txt", sep="/"), open = "wt")
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
11 sink(file = log, type="message")
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
12
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
13 ############################ helper functions #######################
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
14
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
15 # encode one position in one individual
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
16 encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
17 res <- x
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
18 if(!is.na(x)) {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
19 if(isHeterozygous(x, sep = sep)) {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
20 # heterozygous
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
21 res <- code[2]
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
22 } else {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
23 # determine whether it is the minor or major allele
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
24 x <- unlist(strsplit(x, sep))
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
25 # need to check only one element as we already know it is a homozygous
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
26 if(length(x) > 1) {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
27 x <- x[1]
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
28 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
29 if(x==major) {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
30 res <- code[3]
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
31 } else {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
32 res <- code[1]
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
33 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
34 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
35 } else {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
36 res <- NA
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
37 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
38 return(res)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
39 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
40
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
41 # rewrite a marker to determine the major allele
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
42 encodeGenotype.rewrite <- function(x, sep=""){
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
43 res <- x
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
44 if(!is.na(x)) {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
45 if(length(unlist(strsplit(x,sep)))==1) {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
46 # in case of homozygous, must be counted 2 times
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
47 res <- c(x,x)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
48 } else {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
49 res <- unlist(strsplit(x, split=sep))
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
50 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
51 } else {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
52 res <- NA
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
53 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
54 return(res)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
55 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
56
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
57 # encode one individual
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
58 encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
59 newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep))
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
60 stat <- table(as.character(newIndiv))
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
61 major <- names(stat)[which.max(stat)]
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
62 indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep))
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
63 return(indiv)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
64 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
65
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
66
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
67 isHeterozygous <- function(genotype, sep=""){
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
68 bool <- F
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
69 if(is.na(genotype)){
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
70 bool <- NA
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
71 } else {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
72 x <- unlist(strsplit(genotype, sep))
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
73 if(length(x) > 1 & !(x[1] %in% x[2])) {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
74 bool <- T
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
75 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
76
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
77 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
78 return(bool)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
79 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
80
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
81 checkEncoding <- function(encoded, code=c(0,1,2)) {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
82 check <- NULL
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
83 for(i in 1:ncol(encoded)) {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
84 major <- length(which(encoded[,i]==code[3]))
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
85 minor <- length(which(encoded[,i]==code[1]))
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
86 if(major >= minor) {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
87 check <- c(check, T)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
88 } else {
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
89 check <- c(check, F)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
90 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
91 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
92 return(check)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
93 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
94
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
95 ################################## main function ###########################
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
96 # encode all individuals
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
97 # encode genotype into a {-1,0,1} scheme, where -1 = minor homozygous, 0 = heterozygous, 1 = major homozygous
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
98 encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
99 encoded <- apply(raw, 2, encodeGenotype.vec, sep, code)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
100 encoded[is.na(encoded)] <- -1
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
101 write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t")
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
102 }
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
103
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
104 ############################ main #############################
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
105 # running from terminal (supposing the OghmaGalaxy/bin directory is in your path) :
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
106 # encode.sh -i path_to_raw_data -s separator -c code -o path_to_result_directory
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
107 ## -i : path to the file that contains the genotypes to encode, must be a .rda file (as outputed by loadGenotype.R).
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
108 # please note that the table must be called "genotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
109
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
110 ## -s : in case of heterozygous both allele are encoded in the same "cell" of the table and separated by a character
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
111 # (most often "" or "/"). This argument specify which character
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
112
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
113 ## -c : the encoding of minor allele/heterozygous/major allele. by default {-1,0,1}
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
114
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
115 ## -o : path to the file of encoded genotype. the .rda extension is automatically added
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
116
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
117 cmd <- commandArgs(T)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
118 source(cmd[1])
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
119 genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
120 # deal with optional argument
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
121 code <- strsplit(code, ",")
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
122 code <- unlist(lapply(code, as.numeric), use.names = F)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
123 encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out)
f3f230290ffe Uploaded
nicolas
parents:
diff changeset
124 cat(paste(out,".csv", "\n", sep=""))