Mercurial > repos > nicolas > oghma
comparison encode.R @ 2:f3f230290ffe draft
Uploaded
author | nicolas |
---|---|
date | Fri, 21 Oct 2016 06:24:57 -0400 |
parents | |
children | 89175737f16b |
comparison
equal
deleted
inserted
replaced
1:c8ba405ffd13 | 2:f3f230290ffe |
---|---|
1 ######################################################## | |
2 # | |
3 # creation date : 04/01/16 | |
4 # last modification : 17/09/16 | |
5 # author : Dr Nicolas Beaume | |
6 # owner : IRRI | |
7 # | |
8 ######################################################## | |
9 | |
10 log <- file(paste(getwd(), "log_encode.txt", sep="/"), open = "wt") | |
11 sink(file = log, type="message") | |
12 | |
13 ############################ helper functions ####################### | |
14 | |
15 # encode one position in one individual | |
16 encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){ | |
17 res <- x | |
18 if(!is.na(x)) { | |
19 if(isHeterozygous(x, sep = sep)) { | |
20 # heterozygous | |
21 res <- code[2] | |
22 } else { | |
23 # determine whether it is the minor or major allele | |
24 x <- unlist(strsplit(x, sep)) | |
25 # need to check only one element as we already know it is a homozygous | |
26 if(length(x) > 1) { | |
27 x <- x[1] | |
28 } | |
29 if(x==major) { | |
30 res <- code[3] | |
31 } else { | |
32 res <- code[1] | |
33 } | |
34 } | |
35 } else { | |
36 res <- NA | |
37 } | |
38 return(res) | |
39 } | |
40 | |
41 # rewrite a marker to determine the major allele | |
42 encodeGenotype.rewrite <- function(x, sep=""){ | |
43 res <- x | |
44 if(!is.na(x)) { | |
45 if(length(unlist(strsplit(x,sep)))==1) { | |
46 # in case of homozygous, must be counted 2 times | |
47 res <- c(x,x) | |
48 } else { | |
49 res <- unlist(strsplit(x, split=sep)) | |
50 } | |
51 } else { | |
52 res <- NA | |
53 } | |
54 return(res) | |
55 } | |
56 | |
57 # encode one individual | |
58 encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){ | |
59 newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep)) | |
60 stat <- table(as.character(newIndiv)) | |
61 major <- names(stat)[which.max(stat)] | |
62 indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep)) | |
63 return(indiv) | |
64 } | |
65 | |
66 | |
67 isHeterozygous <- function(genotype, sep=""){ | |
68 bool <- F | |
69 if(is.na(genotype)){ | |
70 bool <- NA | |
71 } else { | |
72 x <- unlist(strsplit(genotype, sep)) | |
73 if(length(x) > 1 & !(x[1] %in% x[2])) { | |
74 bool <- T | |
75 } | |
76 | |
77 } | |
78 return(bool) | |
79 } | |
80 | |
81 checkEncoding <- function(encoded, code=c(0,1,2)) { | |
82 check <- NULL | |
83 for(i in 1:ncol(encoded)) { | |
84 major <- length(which(encoded[,i]==code[3])) | |
85 minor <- length(which(encoded[,i]==code[1])) | |
86 if(major >= minor) { | |
87 check <- c(check, T) | |
88 } else { | |
89 check <- c(check, F) | |
90 } | |
91 } | |
92 return(check) | |
93 } | |
94 | |
95 ################################## main function ########################### | |
96 # encode all individuals | |
97 # encode genotype into a {-1,0,1} scheme, where -1 = minor homozygous, 0 = heterozygous, 1 = major homozygous | |
98 encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){ | |
99 encoded <- apply(raw, 2, encodeGenotype.vec, sep, code) | |
100 encoded[is.na(encoded)] <- -1 | |
101 write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t") | |
102 } | |
103 | |
104 ############################ main ############################# | |
105 # running from terminal (supposing the OghmaGalaxy/bin directory is in your path) : | |
106 # encode.sh -i path_to_raw_data -s separator -c code -o path_to_result_directory | |
107 ## -i : path to the file that contains the genotypes to encode, must be a .rda file (as outputed by loadGenotype.R). | |
108 # please note that the table must be called "genotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used) | |
109 | |
110 ## -s : in case of heterozygous both allele are encoded in the same "cell" of the table and separated by a character | |
111 # (most often "" or "/"). This argument specify which character | |
112 | |
113 ## -c : the encoding of minor allele/heterozygous/major allele. by default {-1,0,1} | |
114 | |
115 ## -o : path to the file of encoded genotype. the .rda extension is automatically added | |
116 | |
117 cmd <- commandArgs(T) | |
118 source(cmd[1]) | |
119 genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T) | |
120 # deal with optional argument | |
121 code <- strsplit(code, ",") | |
122 code <- unlist(lapply(code, as.numeric), use.names = F) | |
123 encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out) | |
124 cat(paste(out,".csv", "\n", sep="")) |