Mercurial > repos > nicolas > oghma
changeset 2:f3f230290ffe draft
Uploaded
author | nicolas |
---|---|
date | Fri, 21 Oct 2016 06:24:57 -0400 |
parents | c8ba405ffd13 |
children | 0f87be78e151 |
files | encode.R |
diffstat | 1 files changed, 124 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/encode.R Fri Oct 21 06:24:57 2016 -0400 @@ -0,0 +1,124 @@ +######################################################## +# +# creation date : 04/01/16 +# last modification : 17/09/16 +# author : Dr Nicolas Beaume +# owner : IRRI +# +######################################################## + +log <- file(paste(getwd(), "log_encode.txt", sep="/"), open = "wt") +sink(file = log, type="message") + +############################ helper functions ####################### + +# encode one position in one individual +encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){ + res <- x + if(!is.na(x)) { + if(isHeterozygous(x, sep = sep)) { + # heterozygous + res <- code[2] + } else { + # determine whether it is the minor or major allele + x <- unlist(strsplit(x, sep)) + # need to check only one element as we already know it is a homozygous + if(length(x) > 1) { + x <- x[1] + } + if(x==major) { + res <- code[3] + } else { + res <- code[1] + } + } + } else { + res <- NA + } + return(res) +} + +# rewrite a marker to determine the major allele +encodeGenotype.rewrite <- function(x, sep=""){ + res <- x + if(!is.na(x)) { + if(length(unlist(strsplit(x,sep)))==1) { + # in case of homozygous, must be counted 2 times + res <- c(x,x) + } else { + res <- unlist(strsplit(x, split=sep)) + } + } else { + res <- NA + } + return(res) +} + +# encode one individual +encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){ + newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep)) + stat <- table(as.character(newIndiv)) + major <- names(stat)[which.max(stat)] + indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep)) + return(indiv) +} + + +isHeterozygous <- function(genotype, sep=""){ + bool <- F + if(is.na(genotype)){ + bool <- NA + } else { + x <- unlist(strsplit(genotype, sep)) + if(length(x) > 1 & !(x[1] %in% x[2])) { + bool <- T + } + + } + return(bool) +} + +checkEncoding <- function(encoded, code=c(0,1,2)) { + check <- NULL + for(i in 1:ncol(encoded)) { + major <- length(which(encoded[,i]==code[3])) + minor <- length(which(encoded[,i]==code[1])) + if(major >= minor) { + check <- c(check, T) + } else { + check <- c(check, F) + } + } + return(check) +} + +################################## main function ########################### +# encode all individuals +# encode genotype into a {-1,0,1} scheme, where -1 = minor homozygous, 0 = heterozygous, 1 = major homozygous +encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){ + encoded <- apply(raw, 2, encodeGenotype.vec, sep, code) + encoded[is.na(encoded)] <- -1 + write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t") +} + +############################ main ############################# +# running from terminal (supposing the OghmaGalaxy/bin directory is in your path) : +# encode.sh -i path_to_raw_data -s separator -c code -o path_to_result_directory +## -i : path to the file that contains the genotypes to encode, must be a .rda file (as outputed by loadGenotype.R). +# please note that the table must be called "genotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used) + +## -s : in case of heterozygous both allele are encoded in the same "cell" of the table and separated by a character +# (most often "" or "/"). This argument specify which character + +## -c : the encoding of minor allele/heterozygous/major allele. by default {-1,0,1} + +## -o : path to the file of encoded genotype. the .rda extension is automatically added + +cmd <- commandArgs(T) +source(cmd[1]) +genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T) +# deal with optional argument +code <- strsplit(code, ",") +code <- unlist(lapply(code, as.numeric), use.names = F) +encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out) +cat(paste(out,".csv", "\n", sep="")) \ No newline at end of file