Mercurial > repos > nicolas > oghma
view encode.R @ 4:62e7a8d66b1f draft
Uploaded
author | nicolas |
---|---|
date | Fri, 21 Oct 2016 06:25:28 -0400 |
parents | f3f230290ffe |
children | 89175737f16b |
line wrap: on
line source
######################################################## # # creation date : 04/01/16 # last modification : 17/09/16 # author : Dr Nicolas Beaume # owner : IRRI # ######################################################## log <- file(paste(getwd(), "log_encode.txt", sep="/"), open = "wt") sink(file = log, type="message") ############################ helper functions ####################### # encode one position in one individual encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){ res <- x if(!is.na(x)) { if(isHeterozygous(x, sep = sep)) { # heterozygous res <- code[2] } else { # determine whether it is the minor or major allele x <- unlist(strsplit(x, sep)) # need to check only one element as we already know it is a homozygous if(length(x) > 1) { x <- x[1] } if(x==major) { res <- code[3] } else { res <- code[1] } } } else { res <- NA } return(res) } # rewrite a marker to determine the major allele encodeGenotype.rewrite <- function(x, sep=""){ res <- x if(!is.na(x)) { if(length(unlist(strsplit(x,sep)))==1) { # in case of homozygous, must be counted 2 times res <- c(x,x) } else { res <- unlist(strsplit(x, split=sep)) } } else { res <- NA } return(res) } # encode one individual encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){ newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep)) stat <- table(as.character(newIndiv)) major <- names(stat)[which.max(stat)] indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep)) return(indiv) } isHeterozygous <- function(genotype, sep=""){ bool <- F if(is.na(genotype)){ bool <- NA } else { x <- unlist(strsplit(genotype, sep)) if(length(x) > 1 & !(x[1] %in% x[2])) { bool <- T } } return(bool) } checkEncoding <- function(encoded, code=c(0,1,2)) { check <- NULL for(i in 1:ncol(encoded)) { major <- length(which(encoded[,i]==code[3])) minor <- length(which(encoded[,i]==code[1])) if(major >= minor) { check <- c(check, T) } else { check <- c(check, F) } } return(check) } ################################## main function ########################### # encode all individuals # encode genotype into a {-1,0,1} scheme, where -1 = minor homozygous, 0 = heterozygous, 1 = major homozygous encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){ encoded <- apply(raw, 2, encodeGenotype.vec, sep, code) encoded[is.na(encoded)] <- -1 write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t") } ############################ main ############################# # running from terminal (supposing the OghmaGalaxy/bin directory is in your path) : # encode.sh -i path_to_raw_data -s separator -c code -o path_to_result_directory ## -i : path to the file that contains the genotypes to encode, must be a .rda file (as outputed by loadGenotype.R). # please note that the table must be called "genotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used) ## -s : in case of heterozygous both allele are encoded in the same "cell" of the table and separated by a character # (most often "" or "/"). This argument specify which character ## -c : the encoding of minor allele/heterozygous/major allele. by default {-1,0,1} ## -o : path to the file of encoded genotype. the .rda extension is automatically added cmd <- commandArgs(T) source(cmd[1]) genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T) # deal with optional argument code <- strsplit(code, ",") code <- unlist(lapply(code, as.numeric), use.names = F) encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out) cat(paste(out,".csv", "\n", sep=""))